unicode: ensure 140 char text limit is unicode aware#1576
Conversation
d0e3d11 to
9274422
Compare
|
I'm moving this to DRAFT for now pending further discussion. The current pushed state only implements this functionality for the […]. The issue I'm now having (after pulling in #1577) is writing a test case for this, since it was easy enough to use emoji to make it fail, but now they are being correctly stripped. Other multi-byte characters such as Kanji have the correct length in JS, and the Combining Marks are already being handled, so I can't think of any unicode range (other than using […]).

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index 89f547d4..7b419b52 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -1,10 +1,10 @@
+const _ = require('lodash');
const logger = require('pelias-logger').get('api');
const unicode = require('../helper/unicode');
const Tokenizer = require('pelias-parser/tokenization/Tokenizer');
const Solution = require('pelias-parser/solver/Solution');
const AddressParser = require('pelias-parser/parser/AddressParser');
const parser = new AddressParser();
-const _ = require('lodash');
const MAX_TEXT_LENGTH = 140;
/**
@@ -38,9 +38,9 @@ function _sanitize (raw, clean) {
else {
// truncate text to $MAX_TEXT_LENGTH chars
- if (text.length > MAX_TEXT_LENGTH) {
+ if (unicode.length(text) > MAX_TEXT_LENGTH) {
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
- text = text.substring(0, MAX_TEXT_LENGTH);
+ text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
}
// parse text with pelias/parser
@@ -53,7 +53,7 @@ function _sanitize (raw, clean) {
}
function parse (clean) {
-
+
// parse text
let start = new Date();
const t = new Tokenizer(clean.text);
@@ -225,7 +225,7 @@ function parse (clean) {
}
}
}
-
+
// unknown query type
else {
parsed_text.subject = t.span.body;
diff --git a/test/unit/sanitizer/_text.js b/test/unit/sanitizer/_text.js
index cb55e120..e3492722 100644
--- a/test/unit/sanitizer/_text.js
+++ b/test/unit/sanitizer/_text.js
@@ -143,9 +143,9 @@ module.exports.tests.text_parser = function(test, common) {
test('should truncate very long text inputs', (t) => {
const raw = { text: `
Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about
+We will never make a journey of a thousand miles by fretting about
how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating
+We make the journey by taking each day step by step and then repeating
it again and again until we reach our destination.` };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
@@ -158,12 +158,12 @@ it again and again until we reach our destination.` };
// https://github.com/pelias/api/issues/1574
test('truncate should be unicode aware', (t) => {
- const raw = { text: 'a' + '👩❤️👩'.repeat(200) };
+ const raw = { text: 'a' + '慶'.repeat(200) };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
t.equals(unicode.length(clean.text), 140);
- t.equals(clean.text, 'a' + '👩❤️👩'.repeat(139));
+ t.equals(clean.text, 'a' + '慶'.repeat(139));
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
t.end();
diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index a169d5d9..848aded7 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -1,5 +1,5 @@
-var sanitizer = require('../../../sanitizer/_text_pelias_parser')();
-var type_mapping = require('../../../helper/type_mapping');
+const sanitizer = require('../../../sanitizer/_text_pelias_parser')();
+const unicode = require('../../../helper/unicode');
module.exports.tests = {};
@@ -20,7 +20,7 @@ module.exports.tests.text_parser = function (test, common) {
});
let cases = [];
-
+
// USA queries
cases.push(['soho, new york, NY', {
subject: 'soho',
@@ -322,7 +322,7 @@ module.exports.tests.text_parser = function (test, common) {
cases.push(['New York', { subject: 'New York' }, true]);
cases.push(['New York N', { subject: 'New York' }, true]);
cases.push(['New York NY', { subject: 'New York' }, true]);
-
+
cases.push(['B', { subject: 'B' }, true]);
cases.push(['Be', { subject: 'Be' }, true]);
cases.push(['Ber', { subject: 'Ber' }, true]);
@@ -421,9 +421,9 @@ module.exports.tests.text_parser = function (test, common) {
const raw = {
text: `
Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about
+We will never make a journey of a thousand miles by fretting about
how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating
+We make the journey by taking each day step by step and then repeating
it again and again until we reach our destination.` };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
@@ -433,6 +433,19 @@ it again and again until we reach our destination.` };
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
t.end();
});
+
+ // https://github.com/pelias/api/issues/1574
+ test('truncate should be unicode aware', (t) => {
+ const raw = { text: 'a' + '慶'.repeat(200) };
+ const clean = {};
+ const messages = sanitizer.sanitize(raw, clean);
+
+ t.equals(unicode.length(clean.text), 140);
+ t.equals(clean.text, 'a' + '慶'.repeat(139));
+ t.deepEquals(messages.errors, [], 'no errors');
+ t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
+ t.end();
+ });
};
module.exports.all = function (tape, common) { |
6c21aa8 to
4a0322c
Compare
|
I have revived this PR, and using a new segmentation method, `Intl.Segmenter`, this code now correctly truncates text to 140 visible characters for both search and autocomplete. |
4a0322c to
55ef42f
Compare
55ef42f to
e0ebf72
Compare
|
note: |
Pelias enforces a 140 character limit for the `?text=` param for security/performance reasons.

Prior to this PR, the two methods used to implement this behaviour (namely `String.prototype.length` and `String.prototype.substring`) were not unicode aware.

The linked issue noted that this can cause problems when the `substring` function truncates the string in a place where a 'unicode code point' is represented by multiple 'unicode code units'. In this case we end up slicing it in such a way that the encoding becomes invalid (ie. cutting a two-byte utf8 char after the first byte).
I originally used the package stringz, which is a fairly thin wrapper around the char-regex package.
..but after having a look through the code I discovered that
char-regexwas ported from lodash and I managed to get a reference to the 'internal' function, so we don't need to bring in any new dependencies, plus lodash is well maintained.note: I had a look at trying to detect the charset and handling
utf16->utf8conversion but it's very complicated, at some point I'd be happy with simply detecting and erroring on any non-utf8 charset but even that is not trivial.