Desktop: Resolves #10835: Allow specifying custom language data URLs (#10846)

pull/10849/head
Henry Heino 2024-08-09 03:29:39 -07:00 committed by GitHub
parent 6ce55a5737
commit 806377e6ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 108 additions and 13 deletions

View File

@ -22,6 +22,7 @@ module.exports = {
'ReadableStreamDefaultReader': 'readonly',
'FileSystemCreateWritableOptions': 'readonly',
'FileSystemHandle': 'readonly',
'IDBTransactionMode': 'readonly',
// ServiceWorker
'ExtendableEvent': 'readonly',

View File

@ -130,7 +130,7 @@ class Application extends BaseApplication {
}
if (action.type === 'SETTING_UPDATE_ONE' && action.key === 'ocr.enabled' || action.type === 'SETTING_UPDATE_ALL') {
this.setupOcrService();
void this.setupOcrService();
}
if (action.type === 'SETTING_UPDATE_ONE' && action.key === 'style.editor.fontFamily' || action.type === 'SETTING_UPDATE_ALL') {
@ -360,16 +360,29 @@ class Application extends BaseApplication {
Setting.setValue('wasClosedSuccessfully', false);
}
private setupOcrService() {
private async setupOcrService() {
if (Setting.value('ocr.clearLanguageDataCache')) {
Setting.setValue('ocr.clearLanguageDataCache', false);
try {
await OcrDriverTesseract.clearLanguageDataCache();
} catch (error) {
this.logger().warn('OCR: Failed to clear language data cache.', error);
}
}
if (Setting.value('ocr.enabled')) {
if (!this.ocrService_) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
const Tesseract = (window as any).Tesseract;
const driver = new OcrDriverTesseract(
{ createWorker: Tesseract.createWorker },
`${bridge().buildDir()}/tesseract.js/worker.min.js`,
`${bridge().buildDir()}/tesseract.js-core`,
{
workerPath: `${bridge().buildDir()}/tesseract.js/worker.min.js`,
corePath: `${bridge().buildDir()}/tesseract.js-core`,
languageDataPath: Setting.value('ocr.languageDataPath') || null,
},
);
this.ocrService_ = new OcrService(driver);

View File

@ -105,6 +105,10 @@ class ConfigScreenComponent extends React.Component<any, any> {
Setting.setValue('sync.startupOperation', SyncStartupOperation.ClearLocalData);
await Setting.saveAll();
await restart();
} else if (key === 'ocr.clearLanguageDataCacheButton') {
if (!confirm(this.restartMessage())) return;
Setting.setValue('ocr.clearLanguageDataCache', true);
await restart();
} else if (key === 'sync.openSyncWizard') {
this.props.dispatch({
type: 'DIALOG_OPEN',

View File

@ -498,6 +498,34 @@ const builtInMetadata = (Setting: typeof SettingType) => {
isGlobal: true,
},
'ocr.languageDataPath': {
value: '',
type: SettingItemType.String,
advanced: true,
public: true,
appTypes: [AppType.Desktop],
label: () => _('OCR: Language data URL or path'),
storage: SettingStorage.File,
isGlobal: true,
},
'ocr.clearLanguageDataCache': {
value: false,
type: SettingItemType.Bool,
public: false,
appTypes: [AppType.Desktop],
storage: SettingStorage.Database,
},
'ocr.clearLanguageDataCacheButton': {
value: null as null,
type: SettingItemType.Button,
advanced: true,
public: true,
appTypes: [AppType.Desktop],
label: () => _('OCR: Clear cache and re-download language data files'),
},
theme: {
value: Setting.THEME_LIGHT,
type: SettingItemType.Int,

View File

@ -4,7 +4,6 @@ import OcrDriverBase from '../OcrDriverBase';
import { Minute } from '@joplin/utils/time';
import shim from '../../../shim';
import Logger from '@joplin/utils/Logger';
import Setting from '../../../models/Setting';
const logger = Logger.create('OcrDriverTesseract');
@ -29,18 +28,60 @@ const formatTesseractBoundingBox = (boundingBox: Tesseract.Bbox): RecognizeResul
// Above this is usually reliable.
const minConfidence = 70;
/**
 * Locations of the resources used when creating Tesseract workers.
 *
 * Every field may be null: a null (or empty) value means that no override is
 * passed to Tesseract's `createWorker` for that option.
 */
interface Options {
	// Location of the Tesseract worker script.
	workerPath: string|null;
	// Location of the directory that contains the Tesseract core files.
	corePath: string|null;
	// URL or path of the directory from which language data is loaded.
	languageDataPath: string|null;
}
export default class OcrDriverTesseract extends OcrDriverBase {
private tesseract_: Tesseract = null;
private workerPath_: string|null = null;
private corePath_: string|null = null;
private workerPath_: string;
private corePath_: string;
private languageDataPath_: string|null = null;
private workers_: Record<string, WorkerWrapper[]> = {};
public constructor(tesseract: Tesseract, workerPath: string|null = null, corePath: string|null = null) {
/**
 * Creates a driver that runs OCR through the given Tesseract implementation.
 *
 * @param tesseract Object used to create Tesseract workers.
 * @param options Worker script, core and language data locations to use.
 */
public constructor(tesseract: Tesseract, options: Options) {
	super();

	this.tesseract_ = tesseract;

	// Keep the configured locations so they can be applied when workers are created.
	this.workerPath_ = options.workerPath;
	this.corePath_ = options.corePath;
	this.languageDataPath_ = options.languageDataPath;
}
/**
 * Removes cached language data from IndexedDB.
 *
 * Opens the `keyval-store` database and deletes every entry of its `keyval`
 * object store whose key ends with `.traineddata`. Other entries are kept.
 *
 * @throws Error if `indexedDB` is not available or if an IndexedDB request fails.
 */
public static async clearLanguageDataCache() {
	if (typeof indexedDB === 'undefined') {
		throw new Error('Missing indexedDB access!');
	}

	logger.info('Clearing cached language data...');

	const databaseName = 'keyval-store';
	const storeName = 'keyval';
	const languageDataExtension = '.traineddata';

	// Wraps an IndexedDB request so that it can be awaited.
	const requestAsPromise = <T> (request: IDBRequest<T>) => {
		return new Promise<T>((resolve, reject) => {
			request.addEventListener('success', () => { resolve(request.result); });
			request.addEventListener('error', () => {
				// The failure reason is exposed on the request itself, not on the event.
				if (request.error) {
					reject(new Error(`Request failed: ${request.error}`));
				} else {
					reject(new Error('Request failed with unknown error.'));
				}
			});
		});
	};

	const db = await requestAsPromise(indexedDB.open(databaseName));
	let removeEmptyDatabase = false;

	try {
		if (db.objectStoreNames.contains(storeName)) {
			const getStore = (mode: IDBTransactionMode) => {
				return db.transaction([storeName], mode).objectStore(storeName);
			};

			const allKeys = await requestAsPromise(getStore('readonly').getAllKeys());

			// Keys are not guaranteed to be strings, so check the type before matching the extension.
			const keysToClear = allKeys.filter(key => {
				return typeof key === 'string' && key.endsWith(languageDataExtension);
			});

			for (const key of keysToClear) {
				logger.info('Clearing language data with key', key);
				// A new transaction is used for each deletion because a transaction
				// may no longer be active after awaiting the previous request.
				await requestAsPromise(getStore('readwrite').delete(key));
			}
		} else {
			// Without the store there is nothing to clear. Opening a database that does
			// not exist creates it, so a database with no stores at all is removed below
			// rather than left behind.
			logger.info('No cached language data found.');
			removeEmptyDatabase = db.objectStoreNames.length === 0;
		}
	} finally {
		// Always release the connection, including when a request fails.
		db.close();
	}

	if (removeEmptyDatabase) {
		await requestAsPromise(indexedDB.deleteDatabase(databaseName));
	}
}
private async acquireWorker(language: string) {
@ -59,10 +100,7 @@ export default class OcrDriverTesseract extends OcrDriverBase {
if (this.workerPath_) createWorkerOptions.workerPath = this.workerPath_;
if (this.corePath_) createWorkerOptions.corePath = this.corePath_;
// Getting the language files seems to be broken but maybe only on dev, and this is fixed by
// disabling the cache: https://github.com/naptha/tesseract.js/issues/901
if (Setting.value('env') === 'dev') createWorkerOptions.cacheMethod = 'none';
if (this.languageDataPath_) createWorkerOptions.langPath = this.languageDataPath_;
const worker = await this.tesseract_.createWorker(language, OEM.LSTM_ONLY, createWorkerOptions);

View File

@ -1073,7 +1073,7 @@ const simulateReadOnlyShareEnv = (shareId: string, store?: Store) => {
};
export const newOcrService = () => {
const driver = new OcrDriverTesseract({ createWorker });
const driver = new OcrDriverTesseract({ createWorker }, { workerPath: null, corePath: null, languageDataPath: null });
return new OcrService(driver);
};

View File

@ -121,3 +121,5 @@ COEP
Stormlikes
Stormlikes
BYTV
keyval
traineddata

View File

@ -29,3 +29,11 @@ OCR is a technology that evolves rapidly especially with the recent advances in
Additionally in some cases it may make sense to use a cloud-based solution, or simply connect to your self-hosted or intranet-based server for OCR. The current system will allow this by writing specific drivers for these services.
This pluggable interface is present in the software but not currently exposed. We will do so depending on feedback we receive and potential use cases. If you have any specific use case in mind or notice any issue with the current OCR system feel free to let us know [on the forum](https://discourse.joplinapp.org/).
## Custom OCR language data URL
After enabling OCR, Joplin downloads language files from https://cdn.jsdelivr.net/npm/@tesseract.js-data/. This location can be customized in settings > advanced > "OCR: Language data URL or path". The custom URL or path must point to a directory that contains a `.traineddata.gz` file for each language to be used for OCR.
For reference, an example `.traineddata.gz` file can be found [here](https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz).
To fully replace the cached language data with custom data, it may be necessary to click "OCR: Clear cache and re-download language data files" in settings > advanced.

View File

@ -13,6 +13,7 @@ In order to provide certain features, Joplin may need to connect to third-party
| Spellchecker dictionary | On Linux and Windows, the desktop application downloads the spellchecker dictionary from `redirector.gvt1.com`. | Enabled | Yes <sup>(2)</sup> |
| Plugin repository | The desktop application downloads the list of available plugins from the [official GitHub repository](https://github.com/joplin/plugins). If this repository is not accessible (eg. in China) the app will try to get the plugin list from [various mirrors](https://github.com/laurent22/joplin/blob/8ac6017c02017b6efd59f5fcab7e0b07f8d44164/packages/lib/services/plugins/RepositoryApi.ts#L22), in which case the plugin screen [works slightly differently](https://github.com/laurent22/joplin/issues/5161#issuecomment-925226975). | Enabled | No
| Voice typing | If you use the voice typing feature on Android, the application will download the language files from https://alphacephei.com/vosk/models | Disabled | Yes
| OCR | If you have enabled optical character recognition on desktop, the application will download the language files from https://cdn.jsdelivr.net/npm/@tesseract.js-data/. | Disabled | Yes |
| Crash reports | If you have enabled crash auto-upload, the application will upload the report to Sentry when a crash happens. When Sentry is initialised it will also connect to `sentry.io`. | Disabled | Yes
<sup>(1) https://github.com/laurent22/joplin/issues/5705</sup><br/>