All: Resolves #4613: Improve search with Asian scripts (#5018)

pull/5064/head
mbalint 2021-06-07 16:15:04 +02:00 committed by GitHub
parent 824afd4809
commit 62a371b9f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 937 additions and 900 deletions

View File

@ -409,6 +409,12 @@ For more information see [Plugins](https://github.com/laurent22/joplin/blob/dev/
Joplin implements the SQLite Full Text Search (FTS4) extension. It means the content of all the notes is indexed in real time and search queries return results very fast. Both [Simple FTS Queries](https://www.sqlite.org/fts3.html#simple_fts_queries) and [Full-Text Index Queries](https://www.sqlite.org/fts3.html#full_text_index_queries) are supported. See below for the list of supported queries:
One caveat of SQLite FTS is that it does not support languages which do not use Latin word boundaries (spaces, tabs, punctuation). To solve this issue, Joplin has a custom search mode that does not use FTS but still has all of its features (multi-term search, filters, etc.). One of its drawbacks is that it can get slow on larger note collections. Also, the sorting of the results will be less accurate, as the ranking algorithm (BM25) is, for now, only implemented for FTS. Finally, in this mode there are no restrictions on using the `*` wildcard (`swim*`, `*swim` and `ast*rix` all work). This search mode is currently enabled if one of the following languages is detected:
- Chinese
- Japanese
- Korean
- Thai
## Supported queries
Search type | Description | Example

View File

@ -386,6 +386,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('测试')).length).toBe(1);
expect((await engine.search('测试'))[0].fields).toEqual(['body']);
expect((await engine.search('测试*'))[0].fields).toEqual(['body']);
expect((await engine.search('any:1 type:todo 测试')).length).toBe(1);
}));
it('should support queries with Japanese characters', (async () => {
@ -398,7 +399,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('できません')).length).toBe(1);
expect((await engine.search('できません*'))[0].fields.sort()).toEqual(['body', 'title']); // usually assume that keyword was matched in body
expect((await engine.search('テスト'))[0].fields.sort()).toEqual(['body']);
expect((await engine.search('any:1 type:todo テスト')).length).toBe(1);
}));
it('should support queries with Korean characters', (async () => {
@ -409,6 +410,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('이것은')).length).toBe(1);
expect((await engine.search('말')).length).toBe(1);
expect((await engine.search('any:1 type:todo 말')).length).toBe(1);
}));
it('should support queries with Thai characters', (async () => {
@ -419,28 +421,7 @@ describe('services_SearchEngine', function() {
expect((await engine.search('นี่คือค')).length).toBe(1);
expect((await engine.search('ไทย')).length).toBe(1);
}));
it('should support field restricted queries with Chinese characters', (async () => {
let rows;
const n1 = await Note.save({ title: '你好', body: '我是法国人' });
await engine.syncTables();
expect((await engine.search('title:你好*')).length).toBe(1);
expect((await engine.search('title:你好*'))[0].fields).toEqual(['title']);
expect((await engine.search('body:法国人')).length).toBe(1);
expect((await engine.search('body:法国人'))[0].fields).toEqual(['body']);
expect((await engine.search('body:你好')).length).toBe(0);
expect((await engine.search('title:你好 body:法国人')).length).toBe(1);
expect((await engine.search('title:你好 body:法国人'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:你好 body:bla')).length).toBe(0);
expect((await engine.search('title:你好 我是')).length).toBe(1);
expect((await engine.search('title:你好 我是'))[0].fields.sort()).toEqual(['body', 'title']);
expect((await engine.search('title:bla 我是')).length).toBe(0);
// For non-alpha char, only the first field is looked at, the following ones are ignored
// expect((await engine.search('title:你好 title:hello')).length).toBe(1);
expect((await engine.search('any:1 type:todo ไทย')).length).toBe(1);
}));
it('should parse normal query strings', (async () => {

View File

@ -17,6 +17,7 @@ export default class SearchEngine {
public static relevantFields = 'id, title, body, user_created_time, user_updated_time, is_todo, todo_completed, todo_due, parent_id, latitude, longitude, altitude, source_url';
public static SEARCH_TYPE_AUTO = 'auto';
public static SEARCH_TYPE_BASIC = 'basic';
public static SEARCH_TYPE_NONLATIN_SCRIPT = 'nonlatin';
public static SEARCH_TYPE_FTS = 'fts';
public dispatch: Function = (_o: any) => {};
@ -533,6 +534,7 @@ export default class SearchEngine {
determineSearchType_(query: string, preferredSearchType: any) {
if (preferredSearchType === SearchEngine.SEARCH_TYPE_BASIC) return SearchEngine.SEARCH_TYPE_BASIC;
if (preferredSearchType === SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT) return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;
// If preferredSearchType is "fts" we auto-detect anyway
// because it's not always supported.
@ -547,10 +549,15 @@ export default class SearchEngine {
const textQuery = allTerms.filter(x => x.name === 'text' || x.name == 'title' || x.name == 'body').map(x => x.value).join(' ');
const st = scriptType(textQuery);
if (!Setting.value('db.ftsEnabled') || ['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
if (!Setting.value('db.ftsEnabled')) {
return SearchEngine.SEARCH_TYPE_BASIC;
}
// Non-alphabetical languages aren't supported by SQLite FTS (except with extensions which are not available on all platforms)
if (['ja', 'zh', 'ko', 'th'].indexOf(st) >= 0) {
return SearchEngine.SEARCH_TYPE_NONLATIN_SCRIPT;
}
return SearchEngine.SEARCH_TYPE_FTS;
}
@ -565,7 +572,6 @@ export default class SearchEngine {
const parsedQuery = await this.parseQuery(searchString);
if (searchType === SearchEngine.SEARCH_TYPE_BASIC) {
// Non-alphabetical languages aren't support by SQLite FTS (except with extensions which are not available in all platforms)
searchString = this.normalizeText_(searchString);
const rows = await this.basicSearch(searchString);
@ -579,10 +585,11 @@ export default class SearchEngine {
// when searching.
// https://github.com/laurent22/joplin/issues/1075#issuecomment-459258856
const useFts = searchType === SearchEngine.SEARCH_TYPE_FTS;
try {
const { query, params } = queryBuilder(parsedQuery.allTerms);
const { query, params } = queryBuilder(parsedQuery.allTerms, useFts);
const rows = await this.db().selectAll(query, params);
this.processResults_(rows, parsedQuery);
this.processResults_(rows, parsedQuery, !useFts);
return rows;
} catch (error) {
this.logger().warn(`Cannot execute MATCH query: ${searchString}: ${error.message}`);

File diff suppressed because it is too large Load Diff

View File

@ -21,7 +21,7 @@ enum Requirement {
INCLUSION = 'INCLUSION',
}
const _notebookFilter = (notebooks: string[], requirement: Requirement, conditions: string[], params: string[], withs: string[]) => {
const _notebookFilter = (notebooks: string[], requirement: Requirement, conditions: string[], params: string[], withs: string[], useFts: boolean) => {
if (notebooks.length === 0) return;
const likes = [];
@ -50,12 +50,13 @@ const _notebookFilter = (notebooks: string[], requirement: Requirement, conditio
ON folders.parent_id=${viewName}.id
)`;
const tableName = useFts ? 'notes_normalized' : 'notes';
const where = `
AND ROWID ${requirement === Requirement.EXCLUSION ? 'NOT' : ''} IN (
SELECT notes_normalized.ROWID
SELECT ${tableName}.ROWID
FROM ${viewName}
JOIN notes_normalized
ON ${viewName}.id=notes_normalized.parent_id
JOIN ${tableName}
ON ${viewName}.id=${tableName}.parent_id
)`;
@ -65,12 +66,12 @@ const _notebookFilter = (notebooks: string[], requirement: Requirement, conditio
};
// Applies the `notebook:` filter. The terms named 'notebook' are split into
// their non-negated and negated values, and each list is passed to
// _notebookFilter with Requirement.INCLUSION or Requirement.EXCLUSION.
// `useFts` is forwarded unchanged to _notebookFilter.
const notebookFilter = (terms: Term[], conditions: string[], params: string[], withs: string[]) => {
const notebookFilter = (terms: Term[], conditions: string[], params: string[], withs: string[], useFts: boolean) => {
const notebooksToInclude = terms.filter(x => x.name === 'notebook' && !x.negated).map(x => x.value);
_notebookFilter(notebooksToInclude, Requirement.INCLUSION, conditions, params, withs);
_notebookFilter(notebooksToInclude, Requirement.INCLUSION, conditions, params, withs, useFts);
const notebooksToExclude = terms.filter(x => x.name === 'notebook' && x.negated).map(x => x.value);
_notebookFilter(notebooksToExclude, Requirement.EXCLUSION, conditions, params, withs);
_notebookFilter(notebooksToExclude, Requirement.EXCLUSION, conditions, params, withs, useFts);
};
@ -87,7 +88,8 @@ const filterByTableName = (
noteIDs: string,
requirement: Requirement,
withs: string[],
tableName: string
tableName: string,
useFts: boolean
) => {
const operator: Operation = getOperator(requirement, relation);
@ -144,13 +146,14 @@ const filterByTableName = (
}
// Get the ROWIDs that satisfy the condition so we can filter the result
const targetTableName = useFts ? 'notes_normalized' : 'notes';
const whereCondition = `
${relation} ROWID ${(relation === 'AND' && requirement === 'EXCLUSION') ? 'NOT' : ''}
IN (
SELECT notes_normalized.ROWID
SELECT ${targetTableName}.ROWID
FROM notes_with_${requirement}_${tableName}
JOIN notes_normalized
ON notes_with_${requirement}_${tableName}.id=notes_normalized.id
JOIN ${targetTableName}
ON notes_with_${requirement}_${tableName}.id=${targetTableName}.id
)`;
withs.push(withCondition);
@ -159,7 +162,7 @@ const filterByTableName = (
};
const resourceFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, withs: string[]) => {
const resourceFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, withs: string[], useFts: boolean) => {
const tableName = 'resources';
const resourceIDs = `
@ -177,15 +180,15 @@ const resourceFilter = (terms: Term[], conditions: string[], params: string[], r
const excludedResources = terms.filter(x => x.name === 'resource' && x.negated);
if (requiredResources.length > 0) {
filterByTableName(requiredResources, conditions, params, relation, noteIDsWithResource, Requirement.INCLUSION, withs, tableName);
filterByTableName(requiredResources, conditions, params, relation, noteIDsWithResource, Requirement.INCLUSION, withs, tableName, useFts);
}
if (excludedResources.length > 0) {
filterByTableName(excludedResources, conditions, params, relation, noteIDsWithResource, Requirement.EXCLUSION, withs, tableName);
filterByTableName(excludedResources, conditions, params, relation, noteIDsWithResource, Requirement.EXCLUSION, withs, tableName, useFts);
}
};
const tagFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, withs: string[]) => {
const tagFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, withs: string[], useFts: boolean) => {
const tableName = 'tags';
const tagIDs = `
@ -203,30 +206,32 @@ const tagFilter = (terms: Term[], conditions: string[], params: string[], relati
const excludedTags = terms.filter(x => x.name === 'tag' && x.negated);
if (requiredTags.length > 0) {
filterByTableName(requiredTags, conditions, params, relation, noteIDsWithTag, Requirement.INCLUSION, withs, tableName);
filterByTableName(requiredTags, conditions, params, relation, noteIDsWithTag, Requirement.INCLUSION, withs, tableName, useFts);
}
if (excludedTags.length > 0) {
filterByTableName(excludedTags, conditions, params, relation, noteIDsWithTag, Requirement.EXCLUSION, withs, tableName);
filterByTableName(excludedTags, conditions, params, relation, noteIDsWithTag, Requirement.EXCLUSION, withs, tableName, useFts);
}
};
const genericFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, fieldName: string) => {
const genericFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, fieldName: string, useFts: boolean) => {
if (fieldName === 'iscompleted' || fieldName === 'type') {
// Faster query when values can only take two distinct values
biConditionalFilter(terms, conditions, relation, fieldName);
biConditionalFilter(terms, conditions, relation, fieldName, useFts);
return;
}
const tableName = useFts ? 'notes_normalized' : 'notes';
const getCondition = (term: Term) => {
if (fieldName === 'sourceurl') {
return `notes_normalized.source_url ${term.negated ? 'NOT' : ''} LIKE ?`;
return `${tableName}.source_url ${term.negated ? 'NOT' : ''} LIKE ?`;
} else if (fieldName === 'date' && term.name === 'due') {
return `todo_due ${term.negated ? '<' : '>='} ?`;
} else if (fieldName === 'id') {
return `id ${term.negated ? 'NOT' : ''} LIKE ?`;
} else {
return `notes_normalized.${fieldName === 'date' ? `user_${term.name}_time` : `${term.name}`} ${term.negated ? '<' : '>='} ?`;
return `${tableName}.${fieldName === 'date' ? `user_${term.name}_time` : `${term.name}`} ${term.negated ? '<' : '>='} ?`;
}
};
@ -234,16 +239,16 @@ const genericFilter = (terms: Term[], conditions: string[], params: string[], re
conditions.push(`
${relation} ( ${term.name === 'due' ? 'is_todo IS 1 AND ' : ''} ROWID IN (
SELECT ROWID
FROM notes_normalized
FROM ${tableName}
WHERE ${getCondition(term)}
))`);
params.push(term.value);
});
};
const biConditionalFilter = (terms: Term[], conditions: string[], relation: Relation, filterName: string) => {
const biConditionalFilter = (terms: Term[], conditions: string[], relation: Relation, filterName: string, useFts: boolean) => {
const getCondition = (filterName: string , value: string, relation: Relation) => {
const tableName = (relation === 'AND') ? 'notes_fts' : 'notes_normalized';
const tableName = useFts ? (relation === 'AND' ? 'notes_fts' : 'notes_normalized') : 'notes';
if (filterName === 'type') {
return `${tableName}.is_todo IS ${value === 'todo' ? 1 : 0}`;
} else if (filterName === 'iscompleted') {
@ -262,39 +267,44 @@ const biConditionalFilter = (terms: Term[], conditions: string[], relation: Rela
AND ${getCondition(filterName, value, relation)}`);
}
if (relation === 'OR') {
conditions.push(`
OR ROWID IN (
SELECT ROWID
FROM notes_normalized
WHERE ${getCondition(filterName, value, relation)}
)`);
if (useFts) {
conditions.push(`
OR ROWID IN (
SELECT ROWID
FROM notes_normalized
WHERE ${getCondition(filterName, value, relation)}
)`);
} else {
conditions.push(`
OR ${getCondition(filterName, value, relation)}`);
}
}
});
};
// Applies the `id:` filter: selects the terms named 'id' and delegates to
// genericFilter with the field name 'id', forwarding `useFts`.
const noteIdFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
const noteIdFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, useFts: boolean) => {
const noteIdTerms = terms.filter(x => x.name === 'id');
genericFilter(noteIdTerms, conditions, params, relation, 'id');
genericFilter(noteIdTerms, conditions, params, relation, 'id', useFts);
};
// Applies the `type:` filter: selects the terms named 'type' and delegates to
// genericFilter with the field name 'type' (which genericFilter routes to
// biConditionalFilter), forwarding `useFts`.
const typeFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
const typeFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, useFts: boolean) => {
const typeTerms = terms.filter(x => x.name === 'type');
genericFilter(typeTerms, conditions, params, relation, 'type');
genericFilter(typeTerms, conditions, params, relation, 'type', useFts);
};
// Applies the `iscompleted:` filter: selects the terms named 'iscompleted' and
// delegates to genericFilter with the field name 'iscompleted' (which
// genericFilter routes to biConditionalFilter), forwarding `useFts`.
const completedFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
const completedFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, useFts: boolean) => {
const completedTerms = terms.filter(x => x.name === 'iscompleted');
genericFilter(completedTerms, conditions, params, relation, 'iscompleted');
genericFilter(completedTerms, conditions, params, relation, 'iscompleted', useFts);
};
// Applies the location filters: selects the terms named 'latitude',
// 'longitude' or 'altitude' and delegates to genericFilter with the field name
// 'location', forwarding `useFts`. In genericFilter these become `>=`
// comparisons on the column named after the term (`<` when the term is negated).
const locationFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation) => {
const locationFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation, useFts: boolean) => {
const locationTerms = terms.filter(x => x.name === 'latitude' || x.name === 'longitude' || x.name === 'altitude');
genericFilter(locationTerms, conditons, params, relation, 'location');
genericFilter(locationTerms, conditons, params, relation, 'location', useFts);
};
const dateFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation) => {
const dateFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation, useFts: boolean) => {
const getUnixMs = (date: string): string => {
const yyyymmdd = /^[0-9]{8}$/;
const yyyymm = /^[0-9]{6}$/;
@ -321,44 +331,61 @@ const dateFilter = (terms: Term[], conditons: string[], params: string[], relati
const dateTerms = terms.filter(x => x.name === 'created' || x.name === 'updated' || x.name === 'due');
const unixDateTerms = dateTerms.map(term => { return { ...term, value: getUnixMs(term.value) }; });
genericFilter(unixDateTerms, conditons, params, relation, 'date');
genericFilter(unixDateTerms, conditons, params, relation, 'date', useFts);
};
// Applies the `sourceurl:` filter: selects the terms named 'sourceurl' and
// delegates to genericFilter with the field name 'sourceurl', forwarding
// `useFts`. In genericFilter this becomes a `source_url LIKE ?` condition
// (`NOT LIKE` when the term is negated).
const sourceUrlFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation) => {
const sourceUrlFilter = (terms: Term[], conditons: string[], params: string[], relation: Relation, useFts: boolean) => {
const urlTerms = terms.filter(x => x.name === 'sourceurl');
genericFilter(urlTerms, conditons, params, relation, 'sourceurl');
genericFilter(urlTerms, conditons, params, relation, 'sourceurl', useFts);
};
/**
 * Removes one pair of surrounding double quotes from `str`, if present.
 *
 * A string is only considered quoted when it is at least two characters long:
 * a lone `"` is both the first and the last character, but it is not a quoted
 * (empty) string, so it is returned unchanged instead of being stripped to ''.
 *
 * @param str - The raw term value, possibly wrapped in double quotes.
 * @returns `str` without its enclosing quotes, or `str` itself if not quoted.
 */
const trimQuotes = (str: string): string => {
	const isQuoted = str.length >= 2 && str.startsWith('"') && str.endsWith('"');
	// slice(1, -1) drops the first and last character (replaces deprecated substr).
	return isQuoted ? str.slice(1, -1) : str;
};
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation, useFts: boolean) => {
// Adds a LIKE-based text condition for `term` (the non-FTS search path).
//
// Uses `relation`, `conditions` and `params` from the enclosing scope:
// - 'body' terms are matched against notes.body, 'title' terms against
//   notes.title, and plain 'text' terms against either (body OR title).
// - When `negate` is true the whole group is wrapped in NOT.
// - One parameter is pushed per `?` placeholder, hence twice for 'text' terms.
//
// The pattern is the term value with surrounding quotes removed, every `*`
// turned into the SQL wildcard `%`, and a `%` added on both ends so the term
// matches anywhere in the field.
//
// NOTE(review): `%` and `_` typed by the user are not escaped, so they also
// act as LIKE wildcards.
const createLikeMatch = (term: Term, negate: boolean) => {
	const query = `${relation} ${negate ? 'NOT' : ''} (
	${(term.name === 'text' || term.name === 'body') ? 'notes.body LIKE ? ' : ''}
	${term.name === 'text' ? 'OR' : ''}
	${(term.name === 'text' || term.name === 'title') ? 'notes.title LIKE ? ' : ''})`;

	conditions.push(query);

	// The regex must be global: without the `g` flag only the first `*` is
	// converted, and later ones would be searched for as literal asterisks.
	const param = `%${trimQuotes(term.value).replace(/\*/g, '%')}%`;
	params.push(param);
	if (term.name === 'text') params.push(param);
};
const textFilter = (terms: Term[], conditions: string[], params: string[], relation: Relation) => {
const addExcludeTextConditions = (excludedTerms: Term[], conditions: string[], params: string[], relation: Relation) => {
const type = excludedTerms[0].name === 'text' ? '' : `.${excludedTerms[0].name}`;
if (relation === 'AND') {
conditions.push(`
AND ROWID NOT IN (
SELECT ROWID
FROM notes_fts
WHERE notes_fts${type} MATCH ?
)`);
params.push(excludedTerms.map(x => x.value).join(' OR '));
}
if (relation === 'OR') {
excludedTerms.forEach(term => {
if (useFts) {
const type = excludedTerms[0].name === 'text' ? '' : `.${excludedTerms[0].name}`;
if (relation === 'AND') {
conditions.push(`
OR ROWID IN (
SELECT *
FROM (
SELECT ROWID
FROM notes_fts
EXCEPT
SELECT ROWID
FROM notes_fts
WHERE notes_fts${type} MATCH ?
)
AND ROWID NOT IN (
SELECT ROWID
FROM notes_fts
WHERE notes_fts${type} MATCH ?
)`);
params.push(term.value);
params.push(excludedTerms.map(x => x.value).join(' OR '));
}
if (relation === 'OR') {
excludedTerms.forEach(term => {
conditions.push(`
OR ROWID IN (
SELECT *
FROM (
SELECT ROWID
FROM notes_fts
EXCEPT
SELECT ROWID
FROM notes_fts
WHERE notes_fts${type} MATCH ?
)
)`);
params.push(term.value);
});
}
} else {
excludedTerms.forEach(term => {
createLikeMatch(term, true);
});
}
};
@ -367,13 +394,19 @@ const textFilter = (terms: Term[], conditions: string[], params: string[], relat
const includedTerms = allTerms.filter(x => !x.negated);
if (includedTerms.length > 0) {
conditions.push(`${relation} notes_fts MATCH ?`);
const termsToMatch = includedTerms.map(term => {
if (term.name === 'text') return term.value;
else return `${term.name}:${term.value}`;
});
const matchQuery = (relation === 'OR') ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
params.push(matchQuery);
if (useFts) {
conditions.push(`${relation} notes_fts MATCH ?`);
const termsToMatch = includedTerms.map(term => {
if (term.name === 'text') return term.value;
else return `${term.name}:${term.value}`;
});
const matchQuery = (relation === 'OR') ? termsToMatch.join(' OR ') : termsToMatch.join(' ');
params.push(matchQuery);
} else {
includedTerms.forEach(term => {
createLikeMatch(term, false);
});
}
}
const excludedTextTerms = allTerms.filter(x => x.name === 'text' && x.negated);
@ -404,47 +437,48 @@ const getConnective = (terms: Term[], relation: Relation): string => {
return (!notebookTerm && (relation === 'OR')) ? 'ROWID=-1' : '1'; // ROWID=-1 acts as 0 (something always false)
};
export default function queryBuilder(terms: Term[]) {
export default function queryBuilder(terms: Term[], useFts: boolean) {
const queryParts: string[] = [];
const params: string[] = [];
const withs: string[] = [];
const relation: Relation = getDefaultRelation(terms);
const tableName = useFts ? 'notes_fts' : 'notes';
queryParts.push(`
SELECT
notes_fts.id,
notes_fts.title,
offsets(notes_fts) AS offsets,
matchinfo(notes_fts, 'pcnalx') AS matchinfo,
notes_fts.user_created_time,
notes_fts.user_updated_time,
notes_fts.is_todo,
notes_fts.todo_completed,
notes_fts.parent_id
FROM notes_fts
${tableName}.id,
${tableName}.title,
${useFts ? 'offsets(notes_fts) AS offsets, matchinfo(notes_fts, \'pcnalx\') AS matchinfo,' : ''}
${tableName}.user_created_time,
${tableName}.user_updated_time,
${tableName}.is_todo,
${tableName}.todo_completed,
${tableName}.parent_id
FROM ${tableName}
WHERE ${getConnective(terms, relation)}`);
noteIdFilter(terms, queryParts, params, relation);
noteIdFilter(terms, queryParts, params, relation, useFts);
notebookFilter(terms, queryParts, params, withs);
notebookFilter(terms, queryParts, params, withs, useFts);
tagFilter(terms, queryParts, params, relation, withs);
tagFilter(terms, queryParts, params, relation, withs, useFts);
resourceFilter(terms, queryParts, params, relation, withs);
resourceFilter(terms, queryParts, params, relation, withs, useFts);
textFilter(terms, queryParts, params, relation);
textFilter(terms, queryParts, params, relation, useFts);
typeFilter(terms, queryParts, params, relation);
typeFilter(terms, queryParts, params, relation, useFts);
completedFilter(terms, queryParts, params, relation);
completedFilter(terms, queryParts, params, relation, useFts);
dateFilter(terms, queryParts, params, relation);
dateFilter(terms, queryParts, params, relation, useFts);
locationFilter(terms, queryParts, params, relation);
locationFilter(terms, queryParts, params, relation, useFts);
sourceUrlFilter(terms, queryParts, params, relation);
sourceUrlFilter(terms, queryParts, params, relation, useFts);
let query;
if (withs.length > 0) {