mirror of https://github.com/laurent22/joplin.git
parent
6ce55a5737
commit
806377e6ee
|
@ -22,6 +22,7 @@ module.exports = {
|
|||
'ReadableStreamDefaultReader': 'readonly',
|
||||
'FileSystemCreateWritableOptions': 'readonly',
|
||||
'FileSystemHandle': 'readonly',
|
||||
'IDBTransactionMode': 'readonly',
|
||||
|
||||
// ServiceWorker
|
||||
'ExtendableEvent': 'readonly',
|
||||
|
|
|
@ -130,7 +130,7 @@ class Application extends BaseApplication {
|
|||
}
|
||||
|
||||
if (action.type === 'SETTING_UPDATE_ONE' && action.key === 'ocr.enabled' || action.type === 'SETTING_UPDATE_ALL') {
|
||||
this.setupOcrService();
|
||||
void this.setupOcrService();
|
||||
}
|
||||
|
||||
if (action.type === 'SETTING_UPDATE_ONE' && action.key === 'style.editor.fontFamily' || action.type === 'SETTING_UPDATE_ALL') {
|
||||
|
@ -360,16 +360,29 @@ class Application extends BaseApplication {
|
|||
Setting.setValue('wasClosedSuccessfully', false);
|
||||
}
|
||||
|
||||
private setupOcrService() {
|
||||
private async setupOcrService() {
|
||||
if (Setting.value('ocr.clearLanguageDataCache')) {
|
||||
Setting.setValue('ocr.clearLanguageDataCache', false);
|
||||
try {
|
||||
await OcrDriverTesseract.clearLanguageDataCache();
|
||||
} catch (error) {
|
||||
this.logger().warn('OCR: Failed to clear language data cache.', error);
|
||||
}
|
||||
}
|
||||
|
||||
if (Setting.value('ocr.enabled')) {
|
||||
|
||||
if (!this.ocrService_) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Old code before rule was applied
|
||||
const Tesseract = (window as any).Tesseract;
|
||||
|
||||
const driver = new OcrDriverTesseract(
|
||||
{ createWorker: Tesseract.createWorker },
|
||||
`${bridge().buildDir()}/tesseract.js/worker.min.js`,
|
||||
`${bridge().buildDir()}/tesseract.js-core`,
|
||||
{
|
||||
workerPath: `${bridge().buildDir()}/tesseract.js/worker.min.js`,
|
||||
corePath: `${bridge().buildDir()}/tesseract.js-core`,
|
||||
languageDataPath: Setting.value('ocr.languageDataPath') || null,
|
||||
},
|
||||
);
|
||||
|
||||
this.ocrService_ = new OcrService(driver);
|
||||
|
|
|
@ -105,6 +105,10 @@ class ConfigScreenComponent extends React.Component<any, any> {
|
|||
Setting.setValue('sync.startupOperation', SyncStartupOperation.ClearLocalData);
|
||||
await Setting.saveAll();
|
||||
await restart();
|
||||
} else if (key === 'ocr.clearLanguageDataCacheButton') {
|
||||
if (!confirm(this.restartMessage())) return;
|
||||
Setting.setValue('ocr.clearLanguageDataCache', true);
|
||||
await restart();
|
||||
} else if (key === 'sync.openSyncWizard') {
|
||||
this.props.dispatch({
|
||||
type: 'DIALOG_OPEN',
|
||||
|
|
|
@ -498,6 +498,34 @@ const builtInMetadata = (Setting: typeof SettingType) => {
|
|||
isGlobal: true,
|
||||
},
|
||||
|
||||
'ocr.languageDataPath': {
|
||||
value: '',
|
||||
type: SettingItemType.String,
|
||||
advanced: true,
|
||||
public: true,
|
||||
appTypes: [AppType.Desktop],
|
||||
label: () => _('OCR: Language data URL or path'),
|
||||
storage: SettingStorage.File,
|
||||
isGlobal: true,
|
||||
},
|
||||
|
||||
'ocr.clearLanguageDataCache': {
|
||||
value: false,
|
||||
type: SettingItemType.Bool,
|
||||
public: false,
|
||||
appTypes: [AppType.Desktop],
|
||||
storage: SettingStorage.Database,
|
||||
},
|
||||
|
||||
'ocr.clearLanguageDataCacheButton': {
|
||||
value: null as null,
|
||||
type: SettingItemType.Button,
|
||||
advanced: true,
|
||||
public: true,
|
||||
appTypes: [AppType.Desktop],
|
||||
label: () => _('OCR: Clear cache and re-download language data files'),
|
||||
},
|
||||
|
||||
theme: {
|
||||
value: Setting.THEME_LIGHT,
|
||||
type: SettingItemType.Int,
|
||||
|
|
|
@ -4,7 +4,6 @@ import OcrDriverBase from '../OcrDriverBase';
|
|||
import { Minute } from '@joplin/utils/time';
|
||||
import shim from '../../../shim';
|
||||
import Logger from '@joplin/utils/Logger';
|
||||
import Setting from '../../../models/Setting';
|
||||
|
||||
const logger = Logger.create('OcrDriverTesseract');
|
||||
|
||||
|
@ -29,18 +28,60 @@ const formatTesseractBoundingBox = (boundingBox: Tesseract.Bbox): RecognizeResul
|
|||
// Above this is usually reliable.
|
||||
const minConfidence = 70;
|
||||
|
||||
/**
 * Construction options for `OcrDriverTesseract`.
 *
 * Each value is stored on the driver and, when truthy, forwarded to
 * Tesseract's `createWorker` options when a worker is acquired.
 */
interface Options {
	/** Forwarded as the `workerPath` option when creating a worker. */
	workerPath: string;

	/** Forwarded as the `corePath` option when creating a worker. */
	corePath: string;

	/**
	 * Forwarded as the `langPath` option when creating a worker. When `null`
	 * (or empty) no `langPath` is set on the worker options.
	 */
	languageDataPath: string | null;
}
|
||||
|
||||
export default class OcrDriverTesseract extends OcrDriverBase {
|
||||
|
||||
private tesseract_: Tesseract = null;
|
||||
private workerPath_: string|null = null;
|
||||
private corePath_: string|null = null;
|
||||
private workerPath_: string;
|
||||
private corePath_: string;
|
||||
private languageDataPath_: string|null = null;
|
||||
private workers_: Record<string, WorkerWrapper[]> = {};
|
||||
|
||||
public constructor(tesseract: Tesseract, workerPath: string|null = null, corePath: string|null = null) {
|
||||
public constructor(tesseract: Tesseract, { workerPath, corePath, languageDataPath }: Options) {
|
||||
super();
|
||||
this.tesseract_ = tesseract;
|
||||
this.workerPath_ = workerPath;
|
||||
this.corePath_ = corePath;
|
||||
this.languageDataPath_ = languageDataPath;
|
||||
}
|
||||
|
||||
/**
 * Deletes cached OCR language data from IndexedDB.
 *
 * Opens the `keyval-store` database and removes every entry of its `keyval`
 * object store whose key ends with `.traineddata`. Other entries are left
 * untouched.
 *
 * @throws Error if `indexedDB` is not available in the current environment,
 *   or if any IndexedDB request fails.
 */
public static async clearLanguageDataCache() {
	if (typeof indexedDB === 'undefined') {
		throw new Error('Missing indexedDB access!');
	}

	logger.info('Clearing cached language data...');

	const databaseName = 'keyval-store';
	const storeName = 'keyval';
	const languageDataExtension = '.traineddata';

	// Wraps an IndexedDB request in a promise. The failure reason is exposed on
	// `request.error` (the dispatched event is a plain Event without an `error`
	// property), so it is read from the request.
	const requestAsPromise = <T> (request: IDBRequest) => {
		return new Promise<T>((resolve, reject) => {
			request.addEventListener('success', () => { resolve(request.result); });
			request.addEventListener('error', () => {
				if (request.error) {
					reject(new Error(`Request failed: ${request.error}`));
				} else {
					reject(new Error('Request failed with unknown error.'));
				}
			});
		});
	};

	// Opening without a version only fires "upgradeneeded" when the database
	// did not exist before this call.
	const openRequest = indexedDB.open(databaseName);
	let databaseWasCreated = false;
	openRequest.addEventListener('upgradeneeded', () => { databaseWasCreated = true; });
	const db = await requestAsPromise<IDBDatabase>(openRequest);

	let hasStore = false;
	try {
		hasStore = db.objectStoreNames.contains(storeName);

		if (hasStore) {
			const getStore = (mode: IDBTransactionMode) => {
				return db.transaction([storeName], mode).objectStore(storeName);
			};

			const allKeys = await requestAsPromise<unknown[]>(getStore('readonly').getAllKeys());

			// IndexedDB keys are not necessarily strings -- ignore anything else.
			const keysToClear = allKeys.filter((key): key is string => {
				return typeof key === 'string' && key.endsWith(languageDataExtension);
			});

			for (const key of keysToClear) {
				logger.info('Clearing language data with key', key);
				await requestAsPromise(getStore('readwrite').delete(key));
			}
		} else {
			logger.info('No cached language data found.');
		}
	} finally {
		// Always release the connection so that it cannot block later
		// operations on the same database.
		db.close();
	}

	if (!hasStore && databaseWasCreated) {
		// The database was only created as a side effect of opening it above.
		// Remove it again so that no empty database (without the expected
		// object store) is left behind.
		// NOTE(review): presumably the caching library creates the store in its
		// own "upgradeneeded" handler and would not do so for an existing
		// database -- confirm.
		await requestAsPromise(indexedDB.deleteDatabase(databaseName));
	}
}
|
||||
|
||||
private async acquireWorker(language: string) {
|
||||
|
@ -59,10 +100,7 @@ export default class OcrDriverTesseract extends OcrDriverBase {
|
|||
|
||||
if (this.workerPath_) createWorkerOptions.workerPath = this.workerPath_;
|
||||
if (this.corePath_) createWorkerOptions.corePath = this.corePath_;
|
||||
|
||||
// Getting the language files seems to be broken but maybe only on dev, and this is fixed by
|
||||
// disabling the cache: https://github.com/naptha/tesseract.js/issues/901
|
||||
if (Setting.value('env') === 'dev') createWorkerOptions.cacheMethod = 'none';
|
||||
if (this.languageDataPath_) createWorkerOptions.langPath = this.languageDataPath_;
|
||||
|
||||
const worker = await this.tesseract_.createWorker(language, OEM.LSTM_ONLY, createWorkerOptions);
|
||||
|
||||
|
|
|
@ -1073,7 +1073,7 @@ const simulateReadOnlyShareEnv = (shareId: string, store?: Store) => {
|
|||
};
|
||||
|
||||
export const newOcrService = () => {
|
||||
const driver = new OcrDriverTesseract({ createWorker });
|
||||
const driver = new OcrDriverTesseract({ createWorker }, { workerPath: null, corePath: null, languageDataPath: null });
|
||||
return new OcrService(driver);
|
||||
};
|
||||
|
||||
|
|
|
@ -121,3 +121,5 @@ COEP
|
|||
Stormlikes
|
||||
Stormlikes
|
||||
BYTV
|
||||
keyval
|
||||
traineddata
|
||||
|
|
|
@ -29,3 +29,11 @@ OCR is a technology that evolves rapidly especially with the recent advances in
|
|||
Additionally in some cases it may make sense to use a cloud-based solution, or simply connect to your self-hosted or intranet-based server for OCR. The current system will allow this by writing specific drivers for these services.
|
||||
|
||||
This pluggable interface is present in the software but not currently exposed. We will do so depending on feedback we receive and potential use cases. If you have any specific use case in mind or notice any issue with the current OCR system feel free to let us know [on the forum](https://discourse.joplinapp.org/).
|
||||
|
||||
## Custom OCR language data URL
|
||||
|
||||
After enabling OCR, Joplin downloads language files from https://cdn.jsdelivr.net/npm/@tesseract.js-data/. This URL can be customized in settings > advanced > "OCR: Language data URL or path". This URL or path should point to a directory with a `.traineddata.gz` file for each language to be used for OCR.
|
||||
|
||||
For reference, an example `.traineddata.gz` file can be found [here](https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz).
|
||||
|
||||
To fully replace the cached language data with custom data, it may be necessary to click the "OCR: Clear cache and re-download language data files" button in the advanced settings. Joplin will ask for confirmation and then restart.
|
||||
|
|
|
@ -13,6 +13,7 @@ In order to provide certain features, Joplin may need to connect to third-party
|
|||
| Spellchecker dictionary | On Linux and Windows, the desktop application downloads the spellchecker dictionary from `redirector.gvt1.com`. | Enabled | Yes <sup>(2)</sup> |
|
||||
| Plugin repository | The desktop application downloads the list of available plugins from the [official GitHub repository](https://github.com/joplin/plugins). If this repository is not accessible (eg. in China) the app will try to get the plugin list from [various mirrors](https://github.com/laurent22/joplin/blob/8ac6017c02017b6efd59f5fcab7e0b07f8d44164/packages/lib/services/plugins/RepositoryApi.ts#L22), in which case the plugin screen [works slightly differently](https://github.com/laurent22/joplin/issues/5161#issuecomment-925226975). | Enabled | No
|
||||
| Voice typing | If you use the voice typing feature on Android, the application will download the language files from https://alphacephei.com/vosk/models | Disabled | Yes
|
||||
| OCR | If you have enabled optical character recognition on desktop, the application will download the language files from https://cdn.jsdelivr.net/npm/@tesseract.js-data/. | Disabled | Yes
|
||||
| Crash reports | If you have enabled crash auto-upload, the application will upload the report to Sentry when a crash happens. When Sentry is initialised it will also connect to `sentry.io`. | Disabled | Yes
|
||||
|
||||
<sup>(1) https://github.com/laurent22/joplin/issues/5705</sup><br/>
|
||||
|
|
Loading…
Reference in New Issue