LCOV - code coverage report
Current view: top level - lib/src/utils - html_to_text.dart (source / functions) Hit Total Coverage
Test: merged.info Lines: 129 131 98.5 %
Date: 2024-12-27 12:56:30 Functions: 0 0 -

          Line data    Source code
       1             : /*
       2             :  *   Famedly Matrix SDK
       3             :  *   Copyright (C) 2021 Famedly GmbH
       4             :  *
       5             :  *   This program is free software: you can redistribute it and/or modify
       6             :  *   it under the terms of the GNU Affero General Public License as
       7             :  *   published by the Free Software Foundation, either version 3 of the
       8             :  *   License, or (at your option) any later version.
       9             :  *
      10             :  *   This program is distributed in the hope that it will be useful,
      11             :  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
      12             :  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      13             :  *   GNU Affero General Public License for more details.
      14             :  *
      15             :  *   You should have received a copy of the GNU Affero General Public License
      16             :  *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
      17             :  */
      18             : 
      19             : import 'package:collection/collection.dart';
      20             : import 'package:html/dom.dart';
      21             : import 'package:html/parser.dart';
      22             : import 'package:html_unescape/html_unescape.dart';
      23             : 
      24             : class HtmlToText {
      25             :   /// Convert an HTML string to a pseudo-markdown plain text representation, with
      26             :   /// `data-mx-spoiler` spans redacted
      27           4 :   static String convert(String html) {
      28             :     // riot-web is notorious for creating bad reply fallback events from invalid messages which, if
      29             :     // not handled properly, can lead to impersonation. As such, we strip the entire `<mx-reply>` tags
      30             :     // here already, to prevent that from happening.
      31             :     // We do *not* do this in an AST and just with simple regex here, as riot-web tends to create
      32             :     // miss-matching tags, and this way we actually correctly identify what we want to strip and, well,
      33             :     // strip it.
      34           4 :     final renderHtml = html.replaceAll(
      35           4 :       RegExp(
      36             :         '<mx-reply>.*</mx-reply>',
      37             :         caseSensitive: false,
      38             :         multiLine: false,
      39             :         dotAll: true,
      40             :       ),
      41             :       '',
      42             :     );
      43             : 
      44           4 :     final opts = _ConvertOpts();
      45           8 :     var reply = _walkNode(opts, parseFragment(renderHtml));
      46           8 :     reply = reply.replaceAll(RegExp(r'\s*$', multiLine: false), '');
      47             :     return reply;
      48             :   }
      49             : 
      50           2 :   static String _parsePreContent(_ConvertOpts opts, Element node) {
      51           2 :     var text = node.innerHtml;
      52             :     final match =
      53           2 :         RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false)
      54           2 :             .firstMatch(text);
      55             :     if (match == null) {
      56           4 :       text = HtmlUnescape().convert(text);
      57           2 :       if (text.isNotEmpty) {
      58           4 :         if (text[0] != '\n') {
      59           2 :           text = '\n$text';
      60             :         }
      61           8 :         if (text[text.length - 1] != '\n') {
      62           2 :           text += '\n';
      63             :         }
      64             :       }
      65             :       return text;
      66             :     }
      67             :     // remove <code> opening tag
      68           4 :     text = text.substring(match.end);
      69             :     // remove the </code> closing tag
      70           2 :     text = text.replaceAll(
      71           2 :       RegExp(r'</code>$', multiLine: false, caseSensitive: false),
      72             :       '',
      73             :     );
      74           4 :     text = HtmlUnescape().convert(text);
      75           2 :     if (text.isNotEmpty) {
      76           4 :       if (text[0] != '\n') {
      77           2 :         text = '\n$text';
      78             :       }
      79           8 :       if (text[text.length - 1] != '\n') {
      80           2 :         text += '\n';
      81             :       }
      82             :     }
      83             :     final language =
      84           2 :         RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false)
      85           4 :             .firstMatch(match.group(1)!);
      86             :     if (language != null) {
      87           4 :       text = language.group(1)! + text;
      88             :     }
      89             :     return text;
      90             :   }
      91             : 
      92           2 :   static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
      93           2 :     final msg = _walkChildNodes(opts, node);
      94          12 :     return '${msg.split('\n').map((s) => '> $s').join('\n')}\n';
      95             :   }
      96             : 
      97           2 :   static String _parseSpanContent(_ConvertOpts opts, Element node) {
      98           2 :     final content = _walkChildNodes(opts, node);
      99           6 :     if (node.attributes['data-mx-spoiler'] is String) {
     100           4 :       var spoiler = '█' * content.length;
     101           4 :       final reason = node.attributes['data-mx-spoiler'];
     102           2 :       if (reason != '') {
     103           2 :         spoiler = '($reason) $spoiler';
     104             :       }
     105             :       return spoiler;
     106             :     }
     107             :     return content;
     108             :   }
     109             : 
     110           2 :   static String _parseUlContent(_ConvertOpts opts, Element node) {
     111           4 :     opts.listDepth++;
     112           4 :     final entries = _listChildNodes(opts, node, {'li'});
     113           4 :     opts.listDepth--;
     114             :     final bulletPoint =
     115           8 :         _listBulletPoints[opts.listDepth % _listBulletPoints.length];
     116             : 
     117             :     return entries
     118           2 :         .map(
     119           2 :           (s) =>
     120          14 :               '${'    ' * opts.listDepth}$bulletPoint ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}',
     121             :         )
     122           2 :         .join('\n');
     123             :   }
     124             : 
     125           2 :   static String _parseOlContent(_ConvertOpts opts, Element node) {
     126           4 :     opts.listDepth++;
     127           4 :     final entries = _listChildNodes(opts, node, {'li'});
     128           4 :     opts.listDepth--;
     129           4 :     final startStr = node.attributes['start'];
     130           2 :     final start = (startStr is String &&
     131           4 :             RegExp(r'^[0-9]+$', multiLine: false).hasMatch(startStr))
     132           2 :         ? int.parse(startStr)
     133             :         : 1;
     134             : 
     135             :     return entries
     136           2 :         .mapIndexed(
     137           2 :           (index, s) =>
     138          16 :               '${'    ' * opts.listDepth}${start + index}. ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}',
     139             :         )
     140           2 :         .join('\n');
     141             :   }
     142             : 
     143             :   static const _listBulletPoints = <String>['●', '○', '■', '‣'];
     144             : 
     145           2 :   static List<String> _listChildNodes(
     146             :     _ConvertOpts opts,
     147             :     Element node, [
     148             :     Iterable<String>? types,
     149             :   ]) {
     150           2 :     final replies = <String>[];
     151           4 :     for (final child in node.nodes) {
     152             :       if (types != null &&
     153           2 :           types.isNotEmpty &&
     154           2 :           ((child is Text) ||
     155           2 :               ((child is Element) &&
     156           6 :                   !types.contains(child.localName!.toLowerCase())))) {
     157             :         continue;
     158             :       }
     159           4 :       replies.add(_walkNode(opts, child));
     160             :     }
     161             :     return replies;
     162             :   }
     163             : 
     164             :   static const _blockTags = <String>{
     165             :     'blockquote',
     166             :     'ul',
     167             :     'ol',
     168             :     'h1',
     169             :     'h2',
     170             :     'h3',
     171             :     'h4',
     172             :     'h5',
     173             :     'h6',
     174             :     'pre',
     175             :   };
     176             : 
     177           4 :   static String _walkChildNodes(_ConvertOpts opts, Node node) {
     178             :     var reply = '';
     179             :     var lastTag = '';
     180           8 :     for (final child in node.nodes) {
     181          12 :       final thisTag = child is Element ? child.localName!.toLowerCase() : '';
     182           8 :       if (thisTag == 'p' && lastTag == 'p') {
     183           2 :         reply += '\n\n';
     184           4 :       } else if (_blockTags.contains(thisTag) &&
     185           4 :           reply.isNotEmpty &&
     186           8 :           reply[reply.length - 1] != '\n') {
     187           2 :         reply += '\n';
     188             :       }
     189           8 :       reply += _walkNode(opts, child);
     190           4 :       if (thisTag.isNotEmpty) {
     191             :         lastTag = thisTag;
     192             :       }
     193             :     }
     194             :     return reply;
     195             :   }
     196             : 
     197           4 :   static String _walkNode(_ConvertOpts opts, Node node) {
     198           4 :     if (node is Text) {
     199             :       // ignore \n between single nodes
     200          12 :       return node.text == '\n' ? '' : node.text;
     201           4 :     } else if (node is Element) {
     202           8 :       final tag = node.localName!.toLowerCase();
     203             :       switch (tag) {
     204           4 :         case 'em':
     205           4 :         case 'i':
     206           8 :           return '*${_walkChildNodes(opts, node)}*';
     207           4 :         case 'strong':
     208           4 :         case 'b':
     209           8 :           return '**${_walkChildNodes(opts, node)}**';
     210           4 :         case 'u':
     211           4 :         case 'ins':
     212           4 :           return '__${_walkChildNodes(opts, node)}__';
     213           4 :         case 'del':
     214           4 :         case 'strike':
     215           4 :         case 's':
     216           4 :           return '~~${_walkChildNodes(opts, node)}~~';
     217           4 :         case 'code':
     218           4 :           return '`${node.text}`';
     219           4 :         case 'pre':
     220           4 :           return '```${_parsePreContent(opts, node)}```\n';
     221           4 :         case 'a':
     222           8 :           final href = node.attributes['href'] ?? '';
     223           4 :           final content = _walkChildNodes(opts, node);
     224           8 :           if (href.toLowerCase().startsWith('https://matrix.to/#/') ||
     225           8 :               href.toLowerCase().startsWith('matrix:')) {
     226             :             return content;
     227             :           }
     228           4 :           return '🔗$content';
     229           4 :         case 'img':
     230           4 :           return node.attributes['alt'] ??
     231           0 :               node.attributes['title'] ??
     232           0 :               node.attributes['src'] ??
     233             :               '';
     234           4 :         case 'br':
     235             :           return '\n';
     236           4 :         case 'blockquote':
     237           2 :           return _parseBlockquoteContent(opts, node);
     238           4 :         case 'ul':
     239           2 :           return _parseUlContent(opts, node);
     240           4 :         case 'ol':
     241           2 :           return _parseOlContent(opts, node);
     242           4 :         case 'mx-reply':
     243             :           return '';
     244           4 :         case 'hr':
     245             :           return '\n----------\n';
     246           4 :         case 'h1':
     247           4 :         case 'h2':
     248           4 :         case 'h3':
     249           4 :         case 'h4':
     250           4 :         case 'h5':
     251           4 :         case 'h6':
     252          12 :           final mark = '#' * int.parse(tag[1]);
     253           8 :           return '$mark ${_walkChildNodes(opts, node)}\n';
     254           4 :         case 'span':
     255           2 :           return _parseSpanContent(opts, node);
     256             :         default:
     257           4 :           return _walkChildNodes(opts, node);
     258             :       }
     259             :     } else {
     260           4 :       return _walkChildNodes(opts, node);
     261             :     }
     262             :   }
     263             : }
     264             : 
     265             : class _ConvertOpts {
     266             :   int listDepth = 0;
     267             : }

Generated by: LCOV version 1.14