luci-base: fix UTF handling in super fast hash function

Follow-up fix for commit 2babc47ae2.

The previous code did not handle:
- the 4-byte UTF-8 encoding case
- UTF-16 surrogate pairs

Signed-off-by: Paul Donald <newtwen+github@gmail.com>
This commit is contained in:
Paul Donald 2025-04-12 22:04:32 +02:00
parent 5ca9b36a38
commit 04f2d129d7
No known key found for this signature in database
GPG Key ID: 3FC4A933962871D2
1 changed file with 11 additions and 2 deletions

View File

@ -32,6 +32,15 @@ function sfh(s) {
for (var i = 0; i < s.length; i++) { for (var i = 0; i < s.length; i++) {
var ch = s.charCodeAt(i); var ch = s.charCodeAt(i);
// Handle surrogate pairs
if (ch >= 0xD800 && ch <= 0xDBFF && i + 1 < s.length) {
const next = s.charCodeAt(i + 1);
if (next >= 0xDC00 && next <= 0xDFFF) {
ch = 0x10000 + ((ch - 0xD800) << 10) + (next - 0xDC00);
i++;
}
}
if (ch <= 0x7F) if (ch <= 0x7F)
bytes.push(ch); bytes.push(ch);
else if (ch <= 0x7FF) else if (ch <= 0x7FF)
@ -41,7 +50,7 @@ function sfh(s) {
bytes.push(((ch >>> 12) & 0x0F) | 0xE0, bytes.push(((ch >>> 12) & 0x0F) | 0xE0,
((ch >>> 6) & 0x3F) | 0x80, ((ch >>> 6) & 0x3F) | 0x80,
( ch & 0x3F) | 0x80); ( ch & 0x3F) | 0x80);
else if (code <= 0x10FFFF) else if (ch <= 0x10FFFF)
bytes.push(((ch >>> 18) & 0x07) | 0xF0, bytes.push(((ch >>> 18) & 0x07) | 0xF0,
((ch >>> 12) & 0x3F) | 0x80, ((ch >>> 12) & 0x3F) | 0x80,
((ch >> 6) & 0x3F) | 0x80, ((ch >> 6) & 0x3F) | 0x80,
@ -91,7 +100,7 @@ function sfh(s) {
hash = (hash ^ (hash << 25)) >>> 0; hash = (hash ^ (hash << 25)) >>> 0;
hash += hash >>> 6; hash += hash >>> 6;
return (0x100000000 + hash).toString(16).substr(1); return (0x100000000 + hash).toString(16).slice(1);
} }
var plural_function = null; var plural_function = null;