Line data Source code
1 : /*
2 : * Famedly Matrix SDK
3 : * Copyright (C) 2021 Famedly GmbH
4 : *
5 : * This program is free software: you can redistribute it and/or modify
6 : * it under the terms of the GNU Affero General Public License as
7 : * published by the Free Software Foundation, either version 3 of the
8 : * License, or (at your option) any later version.
9 : *
10 : * This program is distributed in the hope that it will be useful,
11 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : * GNU Affero General Public License for more details.
14 : *
15 : * You should have received a copy of the GNU Affero General Public License
16 : * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 : */
18 :
19 : import 'package:collection/collection.dart';
20 : import 'package:html/dom.dart';
21 : import 'package:html/parser.dart';
22 : import 'package:html_unescape/html_unescape.dart';
23 :
/// Converts HTML into a pseudo-markdown plain text representation.
///
/// All members are static; the entry point is [convert].
class HtmlToText {
  /// Matches from the first `<mx-reply>` through the last `</mx-reply>`.
  ///
  /// The pattern is greedy and `.` also matches line breaks, so mismatched or
  /// repeated reply tags are swallowed as a single span.
  static final _mxReplyPattern = RegExp(
    '<mx-reply>.*</mx-reply>',
    caseSensitive: false,
    multiLine: false,
    dotAll: true,
  );

  /// Matches the whitespace run at the very end of the input.
  static final _trailingWhitespacePattern = RegExp(r'\s*$', multiLine: false);

  /// Matches a leading `<code ...>` tag and captures its attribute text.
  static final _codeOpenTagPattern =
      RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false);

  /// Matches a `</code>` tag at the very end of the input.
  static final _codeCloseTagPattern =
      RegExp(r'</code>$', multiLine: false, caseSensitive: false);

  /// Captures the name following `language-` inside an attribute string.
  static final _languageClassPattern =
      RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false);

  /// Matches a string that consists solely of ASCII digits.
  static final _digitsPattern = RegExp(r'^[0-9]+$', multiLine: false);

  /// Convert an HTML string to a pseudo-markdown plain text representation,
  /// with `data-mx-spoiler` spans redacted.
  ///
  /// Trailing whitespace is removed from the result.
  static String convert(String html) {
    // riot-web is notorious for creating bad reply fallback events from
    // invalid messages which, if not handled properly, can lead to
    // impersonation. As such, we strip the entire `<mx-reply>` tags here
    // already, to prevent that from happening.
    // We do *not* do this in an AST and just with a simple regex here, as
    // riot-web tends to create mismatching tags, and this way we actually
    // correctly identify what we want to strip and, well, strip it.
    final renderHtml = html.replaceAll(_mxReplyPattern, '');

    final opts = _ConvertOpts();
    final reply = _walkNode(opts, parseFragment(renderHtml));
    return reply.replaceAll(_trailingWhitespacePattern, '');
  }

  /// Returns the text that goes between the code fences for a `<pre>` [node].
  ///
  /// The inner HTML is taken verbatim (not walked), HTML entities are
  /// unescaped, and non-empty content is padded so it starts and ends with a
  /// line break. When the content begins with a `<code>` tag, that tag and a
  /// trailing `</code>` are removed, and a `language-xyz` token found in the
  /// tag's attributes is prepended so it lands directly after the opening
  /// fence.
  static String _parsePreContent(_ConvertOpts opts, Element node) {
    final inner = node.innerHtml;
    final codeOpen = _codeOpenTagPattern.firstMatch(inner);
    if (codeOpen == null) {
      return _padWithNewlines(HtmlUnescape().convert(inner));
    }

    final body =
        inner.substring(codeOpen.end).replaceAll(_codeCloseTagPattern, '');
    final text = _padWithNewlines(HtmlUnescape().convert(body));

    final attributes = codeOpen.group(1) ?? '';
    final language = _languageClassPattern.firstMatch(attributes)?.group(1);
    return language == null ? text : '$language$text';
  }

  /// Ensures non-empty [text] begins and ends with `\n`; empty text is
  /// returned unchanged.
  static String _padWithNewlines(String text) {
    if (text.isEmpty) {
      return text;
    }
    var padded = text;
    if (!padded.startsWith('\n')) {
      padded = '\n$padded';
    }
    if (!padded.endsWith('\n')) {
      padded = '$padded\n';
    }
    return padded;
  }

  /// Renders the children of a `<blockquote>` [node], prefixing every line
  /// with `> ` and appending a final line break.
  static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
    final msg = _walkChildNodes(opts, node);
    final quoted = msg.split('\n').map((line) => '> $line').join('\n');
    return '$quoted\n';
  }

  /// Renders a `<span>` [node].
  ///
  /// A span carrying a `data-mx-spoiler` attribute is redacted: each UTF-16
  /// code unit of the rendered content becomes one `█`, preceded by the
  /// attribute value in parentheses when that value is not empty. Any other
  /// span renders as its children.
  static String _parseSpanContent(_ConvertOpts opts, Element node) {
    final content = _walkChildNodes(opts, node);
    final reason = node.attributes['data-mx-spoiler'];
    if (reason == null) {
      return content;
    }
    final redacted = '█' * content.length;
    return reason.isEmpty ? redacted : '($reason) $redacted';
  }

  /// Renders a `<ul>` [node] as bullet lines, one per `<li>` child.
  ///
  /// The bullet glyph is chosen from [_listBulletPoints] by nesting depth and
  /// cycles once the depth exceeds the number of glyphs. Line breaks inside an
  /// entry are re-indented so continuation lines stay under the entry.
  static String _parseUlContent(_ConvertOpts opts, Element node) {
    // Children are walked one level deeper; the depth is restored before this
    // list's own prefix is computed.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    final bulletPoint =
        _listBulletPoints[opts.listDepth % _listBulletPoints.length];
    // NOTE(review): the indent literals below are reproduced exactly as they
    // appear in the reviewed listing, which seems to collapse whitespace runs.
    // Confirm their width against the canonical source.
    final indent = ' ' * opts.listDepth;

    return entries
        .map((s) => '$indent$bulletPoint ${s.replaceAll('\n', '\n$indent ')}')
        .join('\n');
  }

  /// Renders an `<ol>` [node] as numbered lines, one per `<li>` child.
  ///
  /// Numbering begins at the `start` attribute when it consists only of
  /// digits, and at 1 otherwise. A digit string that cannot be represented as
  /// an [int] also falls back to 1 instead of throwing.
  static String _parseOlContent(_ConvertOpts opts, Element node) {
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;

    // The digit check stays in place because `int.tryParse` alone would also
    // accept signs, hex prefixes and surrounding whitespace.
    final startStr = node.attributes['start'];
    final start = (startStr != null && _digitsPattern.hasMatch(startStr))
        ? int.tryParse(startStr) ?? 1
        : 1;
    // NOTE(review): the indent literals below are reproduced exactly as they
    // appear in the reviewed listing, which seems to collapse whitespace runs.
    // Confirm their width against the canonical source.
    final indent = ' ' * opts.listDepth;

    return entries
        .mapIndexed(
          (index, s) =>
              '$indent${start + index}. ${s.replaceAll('\n', '\n$indent ')}',
        )
        .join('\n');
  }

  static const _listBulletPoints = <String>['●', '○', '■', '‣'];

  /// Renders each child of [node] separately and returns the results in
  /// document order.
  ///
  /// When [types] is non-null and non-empty, only element children whose
  /// lower-cased tag name is contained in [types] are rendered; every other
  /// child (text, comments, other elements) is skipped so that it cannot
  /// produce an empty entry. Without a filter all children are rendered.
  static List<String> _listChildNodes(
    _ConvertOpts opts,
    Element node, [
    Iterable<String>? types,
  ]) {
    final filter = (types == null || types.isEmpty) ? null : types;
    final replies = <String>[];
    for (final child in node.nodes) {
      if (filter != null &&
          (child is! Element ||
              !filter.contains(child.localName?.toLowerCase()))) {
        continue;
      }
      replies.add(_walkNode(opts, child));
    }
    return replies;
  }

  static const _blockTags = <String>{
    'blockquote',
    'ul',
    'ol',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'pre',
  };

  /// Renders all children of [node] and concatenates the results.
  ///
  /// A blank line is inserted between two `<p>` elements that follow each
  /// other (ignoring non-element nodes in between), and an element listed in
  /// [_blockTags] is moved onto a fresh line when the output so far does not
  /// already end with a line break.
  static String _walkChildNodes(_ConvertOpts opts, Node node) {
    var reply = '';
    var lastTag = '';
    for (final child in node.nodes) {
      final thisTag =
          child is Element ? (child.localName ?? '').toLowerCase() : '';
      if (thisTag == 'p' && lastTag == 'p') {
        reply += '\n\n';
      } else if (_blockTags.contains(thisTag) &&
          reply.isNotEmpty &&
          !reply.endsWith('\n')) {
        reply += '\n';
      }
      reply += _walkNode(opts, child);
      // Only elements update `lastTag`, so text between two paragraphs does
      // not reset the paragraph-separation check above.
      if (thisTag.isNotEmpty) {
        lastTag = thisTag;
      }
    }
    return reply;
  }

  /// Renders a single [node] according to its kind and tag name.
  ///
  /// Text nodes yield their text, except that a text node consisting of
  /// exactly one line break yields nothing. Elements are mapped to their
  /// pseudo-markdown form by tag; unknown tags and non-text, non-element nodes
  /// render as the concatenation of their children.
  static String _walkNode(_ConvertOpts opts, Node node) {
    if (node is Text) {
      // ignore \n between single nodes
      return node.text == '\n' ? '' : node.text;
    }
    if (node is! Element) {
      return _walkChildNodes(opts, node);
    }

    final tag = (node.localName ?? '').toLowerCase();
    switch (tag) {
      case 'em':
      case 'i':
        return '*${_walkChildNodes(opts, node)}*';
      case 'strong':
      case 'b':
        return '**${_walkChildNodes(opts, node)}**';
      case 'u':
      case 'ins':
        return '__${_walkChildNodes(opts, node)}__';
      case 'del':
      case 'strike':
      case 's':
        return '~~${_walkChildNodes(opts, node)}~~';
      case 'code':
        return '`${node.text}`';
      case 'pre':
        return '```${_parsePreContent(opts, node)}```\n';
      case 'a':
        final href = (node.attributes['href'] ?? '').toLowerCase();
        final content = _walkChildNodes(opts, node);
        // matrix.to and matrix: links are rendered as their bare content,
        // without the link marker.
        if (href.startsWith('https://matrix.to/#/') ||
            href.startsWith('matrix:')) {
          return content;
        }
        return '🔗$content';
      case 'img':
        return node.attributes['alt'] ??
            node.attributes['title'] ??
            node.attributes['src'] ??
            '';
      case 'br':
        return '\n';
      case 'blockquote':
        return _parseBlockquoteContent(opts, node);
      case 'ul':
        return _parseUlContent(opts, node);
      case 'ol':
        return _parseOlContent(opts, node);
      case 'mx-reply':
        return '';
      case 'hr':
        return '\n----------\n';
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // The digit in the tag name is the heading level.
        final mark = '#' * int.parse(tag[1]);
        return '$mark ${_walkChildNodes(opts, node)}\n';
      case 'span':
        return _parseSpanContent(opts, node);
      default:
        return _walkChildNodes(opts, node);
    }
  }
}
264 :
/// Mutable state threaded through a single conversion pass.
class _ConvertOpts {
  /// Current list nesting level.
  ///
  /// Raised by one while the entries of a `<ul>` or `<ol>` are rendered and
  /// lowered again afterwards, so it is 0 outside of any list.
  var listDepth = 0;
}
|