chat: preprocess text msgs w/md4c+tidy-html5

Introduces MessageParser to encapsulate text treatment for raw text messages.

The async parsing sequence is as follows:
- Markdown -> HTML (md4c)
- link coloration (tidy-html5)
- notify UI
- request link preview info from PreviewEngine for the first link

Related changes:
- PreviewEngine now uses QtNetwork instead of QtWebEngine
- Linkification is handled by MessageParser instead of linkify.js

QtWebEngine is no longer required for message parsing.

Gitlab: #1033
Gitlab: #855
Change-Id: Ief9b91aa291caf284f08230acaf57976f80fa05b
This commit is contained in:
Andreas Traczyk
2023-03-20 16:26:37 -04:00
committed by Sébastien Blin
parent 07527be378
commit 8db188c513
37 changed files with 979 additions and 4312 deletions

View File

@@ -1,100 +0,0 @@
/*
* Copyright (c) 2021 SoapBox Innovations Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
var linkifyStr = (function (linkifyjs) {
'use strict';
/**
Convert strings of text into linkable HTML text
*/
/**
 * Escape the characters that are significant in HTML text content
 * (`&`, `<` and `>`) so the text can be embedded between tags.
 *
 * @param {string} text raw text
 * @returns {string} text with `&`, `<`, `>` replaced by entities
 */
function escapeText(text) {
  var entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;'
  };
  // A single pass over the input: every special character is mapped exactly
  // once, so entities produced here are never escaped a second time.
  return text.replace(/[&<>]/g, function (ch) {
    return entities[ch];
  });
}
/**
 * Escape a value so it can be placed inside a double-quoted HTML attribute.
 *
 * `&` is escaped in addition to `"`: otherwise entity-like sequences that are
 * literally present in the value (e.g. `&amp;` or `&quot;` inside a URL query)
 * would be decoded by the HTML parser and change the attribute value.
 *
 * @param {string} href raw attribute value
 * @returns {string} value safe for use between double quotes
 */
function escapeAttr(href) {
  // `&` must be handled first so the `&` of `&quot;` is not escaped again.
  return href.replace(/&/g, '&amp;').replace(/"/g, '&quot;');
}
/**
 * Serialize an attribute map into `name="value"` pairs separated by single
 * spaces. Values are coerced to strings and passed through `escapeAttr`.
 *
 * @param {Object} attributes attribute name -> value (all enumerable keys are used)
 * @returns {string} serialized attributes, or '' when there are none
 */
function attributesToString(attributes) {
  var html = '';
  for (var name in attributes) {
    var pair = name + '="' + escapeAttr(attributes[name] + '') + '"';
    // A pair is never empty, so an empty accumulator means "first pair".
    html = html === '' ? pair : html + ' ' + pair;
  }
  return html;
}
/**
 * Default renderer: turn an intermediate link description into an HTML
 * element string. The content is escaped with `escapeText`; attributes are
 * serialized with `attributesToString`.
 *
 * @param {{tagName: string, attributes: Object, content: string}} _ref
 * @returns {string} `<tagName attrs>content</tagName>`
 */
function defaultRender(_ref) {
  var tagName = _ref.tagName;
  var attributes = _ref.attributes;
  var content = _ref.content;
  var openingTag = "<" + tagName + " " + attributesToString(attributes) + ">";
  var closingTag = "</" + tagName + ">";
  return openingTag + escapeText(content) + closingTag;
}
/**
 * Convert a plain text string to an HTML string with links. Expects that the
 * given string does not contain any HTML entities. Use the linkify-html
 * interface if you need to parse HTML entities.
 *
 * Each token produced by `linkifyjs.tokenize` is rendered as follows:
 * - a newline token becomes `<br>\n` when the `nl2br` option is set;
 * - a link token accepted by `opts.check` is rendered with `opts.render`;
 * - anything else is emitted as escaped text.
 *
 * @param {string} str string to linkify
 * @param {import('linkifyjs').Opts} [opts] overridable options
 * @returns {string}
 */
function linkifyStr(str, opts) {
  var options = new linkifyjs.Options(opts === undefined ? {} : opts, defaultRender);
  return linkifyjs.tokenize(str).map(function (token) {
    if (token.t === 'nl' && options.get('nl2br')) {
      return '<br>\n';
    }
    if (token.isLink && options.check(token)) {
      return options.render(token);
    }
    return escapeText(token.toString());
  }).join('');
}
// Install a non-writable `String.prototype.linkify(options)` convenience
// method that forwards to linkifyStr, unless something already provides it.
if (!String.prototype.linkify) {
  var linkifyDescriptor = {
    value: function linkify(options) {
      return linkifyStr(this, options);
    },
    writable: false
  };
  Object.defineProperty(String.prototype, 'linkify', linkifyDescriptor);
}
return linkifyStr;
})(linkify);

File diff suppressed because one or more lines are too long

View File

@@ -1,126 +0,0 @@
/* MIT License
Copyright (c) 2019 Andrej Gajdos
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.*/
/**
 * Retrieves the title of a webpage which is used to fill out the preview of a hyperlink.
 *
 * Sources are tried in order and the first non-empty one wins:
 * og:title meta, twitter:title meta, the document title, the first <h1>,
 * the first <h2>.
 *
 * @param doc the DOM of the url that is being previewed
 * @returns the title of the given webpage, or null if none was found
 */
function getTitle(doc){
    const metaSelectors = [
        'meta[property="og:title"]',
        'meta[name="twitter:title"]'
    ]
    for (const selector of metaSelectors) {
        const meta = doc.querySelector(selector)
        if (meta !== null && meta.content.length > 0) {
            return meta.content
        }
    }

    const docTitle = doc.title
    if (typeof docTitle === "string" && docTitle.length > 0) {
        return docTitle
    }

    for (const tag of ["h1", "h2"]) {
        const heading = doc.querySelector(tag)
        if (heading === null) {
            continue
        }
        // Use the text, not innerHTML: the page is untrusted and a title
        // must not carry the heading's nested markup.
        const text = heading.textContent.trim()
        if (text.length > 0) {
            return text
        }
    }
    return null
}
/**
 * Obtains a description of the webpage for the hyperlink preview.
 *
 * Sources are tried in order and the first non-empty one wins:
 * og:description meta, twitter:description meta, description meta, then the
 * text of the first <p> that has an offsetParent and no child elements.
 *
 * @param doc the DOM of the url that is being previewed
 * @returns a description of the webpage, or null if none was found
 */
function getDescription(doc){
    const metaSelectors = [
        'meta[property="og:description"]',
        'meta[name="twitter:description"]',
        'meta[name="description"]'
    ]
    for (const selector of metaSelectors) {
        const meta = doc.querySelector(selector)
        if (meta !== null && meta.content.length > 0) {
            return meta.content
        }
    }

    // The previous test was `!childElementCount !== 0`, which compares a
    // boolean with a number and is therefore always true; the intent is to
    // accept only paragraphs without child elements.
    // NOTE(review): offsetParent is null for elements that are not rendered;
    // if `doc` is a detached document (e.g. one built by DOMParser) this
    // fallback presumably never matches - confirm against the caller.
    const paragraph = Array.from(doc.querySelectorAll("p")).find(p =>
        p.offsetParent !== null && p.childElementCount === 0)
    return paragraph !== undefined ? paragraph.textContent : null
}
/**
 * Gets the image that represents a webpage.
 *
 * Sources are tried in order and the first non-empty one wins:
 * og:image meta, <link rel="image_src">, twitter:image meta, then the first
 * <img> whose natural size is larger than 50px on both sides and whose
 * aspect ratio does not exceed 3:1.
 *
 * Previously the <img> candidates were filtered and then discarded, so the
 * function returned null whenever no meta/link image was present.
 *
 * @param doc the DOM of the url that is being previewed
 * @returns the image representing the url or null if no such image was found
 */
function getImage(doc) {
    const MIN_SIDE = 50
    const MAX_ASPECT_RATIO = 3

    const ogImage = doc.querySelector('meta[property="og:image"]')
    if (ogImage !== null && ogImage.content.length > 0){
        return ogImage.content
    }
    const imageRelLink = doc.querySelector('link[rel="image_src"]')
    if (imageRelLink !== null && imageRelLink.href.length > 0){
        return imageRelLink.href
    }
    const twitterImage = doc.querySelector('meta[name="twitter:image"]')
    if (twitterImage !== null && twitterImage.content.length > 0) {
        return twitterImage.content
    }

    // NOTE(review): naturalWidth/naturalHeight are presumably 0 for images
    // that have not been loaded, in which case no <img> qualifies - confirm.
    const candidate = Array.from(doc.getElementsByTagName("img")).find(img => {
        const shortSide = Math.min(img.naturalWidth, img.naturalHeight)
        const longSide = Math.max(img.naturalWidth, img.naturalHeight)
        return shortSide > MIN_SIDE &&
            longSide / shortSide <= MAX_ASPECT_RATIO &&
            img.src.length > 0
    })
    return candidate !== undefined ? candidate.src : null
}

View File

@@ -1,93 +0,0 @@
// Open the web channel over the Qt-provided transport and, once it is ready,
// publish the `jsbridge` object on `window` for the functions in this script.
_ = new QWebChannel(qt.webChannelTransport, (channel) => {
    const bridge = channel.objects.jsbridge
    window.jsbridge = bridge
})
/**
 * Forwards a log message to the bridge object's `log` method.
 * @param msg the message to log
 */
function log(msg) {
    const bridge = window.jsbridge
    bridge.log(msg)
}
/**
 * Fetches `url`, extracts link-preview metadata (title, image, description)
 * from the returned HTML and reports it through
 * `window.jsbridge.emitInfoReady(messageId, info)`.
 *
 * The bridge is notified exactly once per call, also when the URL cannot be
 * parsed, the request fails or the response is not HTML; fields that could
 * not be determined are null.
 *
 * @param messageId identifier passed back unchanged to emitInfoReady
 * @param url link to preview; "https://" is prepended when it cannot be
 *            parsed as an absolute URL on its own
 */
function getPreviewInfo(messageId, url) {
    // Stop reading the response body once this many bytes were received.
    const MAX_BODY_BYTES = 2 * 1024 * 1024

    var title = null
    var description = null
    var image = null
    var domain = null

    function emitInfo() {
        window.jsbridge.emitInfoReady(messageId, {
            'title': title,
            'image': image,
            'description': description,
            'url': url,
            'domain': domain,
        })
    }

    // Wraps `body` in a stream that ends after MAX_BODY_BYTES bytes in total.
    function limitBody(body) {
        const reader = body.getReader()
        let received = 0
        return new ReadableStream({
            start(controller) {
                function pump() {
                    return reader.read().then(({ done, value }) => {
                        // When no more data needs to be consumed, close the stream
                        if (done) {
                            controller.close()
                            return
                        }
                        // The limit applies to the running total, not to a
                        // single chunk (chunks are usually far smaller).
                        received += value.byteLength
                        if (received > MAX_BODY_BYTES) {
                            controller.close()
                            // Tell the source we are done so it can stop downloading
                            return reader.cancel()
                        }
                        // Enqueue the next data chunk into our target stream
                        controller.enqueue(value)
                        return pump()
                    })
                }
                return pump()
            }
        })
    }

    // `new URL()` throws on scheme-less input such as "example.com/page", so
    // the "https://" fallback is applied when parsing fails instead of by
    // inspecting the protocol of an already parsed URL.
    var target = null
    try {
        target = new URL(url)
    } catch (parseError) {
        target = null
    }
    if (target === null) {
        const withScheme = "https://".concat(url)
        try {
            target = new URL(withScheme)
            url = withScheme
        } catch (err) {
            log("Error occured while fetching document: " + err)
            emitInfo()
            return
        }
    }
    // Computed up front so the bridge receives a string even when the fetch
    // fails; only a leading "www." is stripped.
    domain = target.hostname.replace(/^www\./, "")

    fetch(url, {
        mode: 'no-cors',
        headers: {'Set-Cookie': 'SameSite=None; Secure'}
    }).then(function (response) {
        const contentType = response.headers.get('content-type')
        if (!contentType || !contentType.includes('text/html')) {
            // Only HTML documents can be previewed; fail with an explicit
            // reason instead of a TypeError further down the chain.
            throw new Error("unsupported content type: " + contentType)
        }
        return new Response(limitBody(response.body)).text()
    }).then(function (html) {
        // create DOM from html string
        var doc = new DOMParser().parseFromString(html, "text/html")
        if (!url.includes("twitter.com")){
            title = getTitle(doc)
        } else {
            title = "Twitter. It's what's happening."
        }
        image = getImage(doc, url)
        description = getDescription(doc)
    }).catch(function (err) {
        log("Error occured while fetching document: " + err)
    }).finally(emitInfo)
}
/**
 * Linkifies a raw text message and hands the result to the bridge.
 *
 * Nothing is emitted when the message contains no links. Otherwise, if
 * `showPreview` is set, preview info is requested for the first link, and
 * the message with its links wrapped in colored anchors is passed to
 * `window.jsbridge.emitLinkified`.
 *
 * @param messageId identifier forwarded to the bridge callbacks
 * @param message raw message text
 * @param showPreview whether to request preview info for the first link
 * @param color CSS color applied to the generated links
 */
function parseMessage(messageId, message, showPreview, color='#0645AD') {
    const found = linkify.find(message)
    if (found.length > 0) {
        if (showPreview) {
            getPreviewInfo(messageId, found[0].href)
        }
        const linkAttributes = { style: "color:" + color + ";" }
        const html = linkifyStr(message, { attributes: linkAttributes })
        window.jsbridge.emitLinkified(messageId, html)
    }
}