diff --git a/README.md b/README.md index bb592e2..816c1b5 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,13 @@ # Specification of the technical architecture, interface definitions and data exchange format(s) See [https://ocr-d.github.io/](https://ocr-d.github.io/). + +## Line Ground Truth + +* [Spec](./gt-spec.md) +* [BagIt profile](./gt-profile.yml) + +## Engine training + +* [Spec](./training-spec.md) +* [JSON schema](./training-schema.yml) diff --git a/gt-profile.json b/gt-profile.json new file mode 100644 index 0000000..004e732 --- /dev/null +++ b/gt-profile.json @@ -0,0 +1 @@ +{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Prediction-Directory":{"required":false,"default":"pred"},"Gt-Prediction-Extension":{"required":false,"default":".pred.txt"},"Gt-Prediction-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":fal
se,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md","build.sh"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml new file mode 100644 index 0000000..4457e5a --- /dev/null +++ b/gt-profile.yml @@ -0,0 +1,115 @@ +BagIt-Profile-Info: + BagIt-Profile-Identifier: https://ocr-d.github.io/gt-profile.json + BagIt-Profile-Version: '1.2.0' + Source-Organization: OCR-D + External-Description: BagIt profile for OCR line Ground Truth + Contact-Name: Konstantin Baierer + Contact-Email: konstantin.baierer@sbb.spk-berlin.de + Version: 0.1 +Bag-Info: + Bagging-Date: + required: false + Source-Organization: + required: false + Gt-Transcription-Extension: + required: false + default: '.gt.txt' + Gt-Transcription-Media-Type: + required: false + default: 'text/plain' + Gt-Prediction-Directory: + required: false + default: 'pred' + Gt-Prediction-Extension: + required: false + default: '.pred.txt' + Gt-Prediction-Media-Type: + required: false + default: 'text/plain' + Gt-Transcription-Directory: + required: false + default: 'text' + Gt-Transcription-Normalization: + required: true + 
values: + - NFD + - NFKD + - NFC + - NFKC + - non-normalized + Gt-Color-Image-Extension: + required: false + default: '.color.png' + Gt-Color-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + - 'image/jpeg' + Gt-Color-Image-Directory: + required: false + default: 'img' + Gt-Grayscale-Image-Extension: + required: false + default: '.nrm.png' + Gt-Grayscale-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + - 'image/jpeg' + Gt-Grayscale-Image-Directory: + required: false + default: 'grayscale' + Gt-Bitonal-Image-Extension: + required: false + default: '.bin.png' + Gt-Bitonal-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + Gt-Bitonal-Image-Directory: + required: false + default: 'bin' + Gt-Line-Metadata-Extension: + required: false + default: '.json' + Gt-Line-Metadata-Media-Type: + required: false + default: 'application/json' + values: + - 'application/json' + - 'text/vnd.yaml' + Gt-Line-Metadata-Directory: + required: false + default: 'meta' + Gt-Directory: + required: false + default: 'ground-truth' + Gt-Directory-Structure: + required: false + default: 'flat' + values: + # img and transcription in the Gt-Directory + - 'flat' + # img and transcription in the same dir below Gt-Directory + - 'flat-nested' + # img and transcription in subfolders Gt-Bitonal-Image-Directory and Gt-Transcription-Directory of Gt-Directory + - 'subfolders' + # img and transcription in subfolders Gt-Bitonal-Image-Directory and Gt-Transcription-Directory in the same dir below Gt-Directory + - 'subfolders-nested' +Manifests-Required: ['sha512'] +Tag-Manifests-Required: [] +Tag-Files-Required: [] +Tag-Files-Allowed: + - README.md + - build.sh +Allow-Fetch.txt: false +Serialization: allowed +Accept-Serialization: application/zip +Accept-BagIt-Version: + - '1.0' diff --git a/gt-spec.md b/gt-spec.md new file mode 100644 index 
0000000..d718b41 --- /dev/null +++ b/gt-spec.md @@ -0,0 +1,218 @@ +# linegt + +> An exchange format for line-based ground truth for OCR + + +* [Rationale](#rationale) +* [BagIt](#bagit) +* [BagIt profile](#bagit-profile) + * [Gt-Transcription-Extension](#gt-transcription-extension) + * [Gt-Transcription-Media-Type](#gt-transcription-media-type) + * [Gt-Transcription-Directory](#gt-transcription-directory) + * [Gt-Transcription-Normalization](#gt-transcription-normalization) + * [Gt-Prediction-Extension](#gt-prediction-extension) + * [Gt-Prediction-Media-Type](#gt-prediction-media-type) + * [Gt-Prediction-Directory](#gt-prediction-directory) + * [Gt-Grayscale-Image-Extension](#gt-grayscale-image-extension) + * [Gt-Grayscale-Image-Media-Type](#gt-grayscale-image-media-type) + * [Gt-Grayscale-Image-Directory](#gt-grayscale-image-directory) + * [Gt-Color-Image-Extension](#gt-color-image-extension) + * [Gt-Color-Image-Media-Type](#gt-color-image-media-type) + * [Gt-Color-Image-Directory](#gt-color-image-directory) + * [Gt-Bitonal-Image-Extension](#gt-bitonal-image-extension) + * [Gt-Bitonal-Image-Media-Type](#gt-bitonal-image-media-type) + * [Gt-Bitonal-Image-Directory](#gt-bitonal-image-directory) + * [Gt-Line-Metadata-Extension](#gt-line-metadata-extension) + * [Gt-Line-Metadata-Media-Type](#gt-line-metadata-media-type) + * [Gt-Line-Metadata-Directory](#gt-line-metadata-directory) + * [Gt-Directory](#gt-directory) + * [Gt-Directory-Structure](#gt-directory-structure) +* [Line metadata](#line-metadata) + + + +## Rationale + +Recent OCR (optical character recognition) engines are not actually +character-based anymore but use neural networks that operate on lines. These +engines can be trained with images of text lines and their transcription +("ground truth"), plus engine-specific configurations. + +This format defines a standardized format to bundle such ground truth, based on +the BagIt conventions. 
+ +## BagIt + +A `linegt` bag must be a valid BagIt bag: + +* Root folder must contain a file `bagit.txt` +* Root folder must contain a file `bag-info.txt` with metadata about the bag +* All payload files must be under a folder `/data` +* Every file in `/data` along with its `<algorithm>` checksum must be listed in a + file `manifest-<algorithm>.txt` + +## BagIt profile + +In addition to the requirements of BagIt, an `ocr_linegt` bag must adhere to +the `ocr_linegt` BagIt profile. + +### Gt-Transcription-Extension + +Extension of the transcription files. Default: `.gt.txt`. + +### Gt-Transcription-Media-Type + +Media type of the transcription files. Default: `text/plain`. + +### Gt-Transcription-Directory + +Name of the subfolder containing transcriptions if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `text`. + +### Gt-Transcription-Normalization + +**Required** + +All transcriptions MUST be UTF-8 encoded Unicode. This property defines the +unicode normalization level. + +One of `NFC`, `NFKC`, `NFD`, `NFKD` or `non-normalized`. + +![Illustration unicode normalization](http://unicode.org/reports/tr15/images/UAX15-NormFig6.jpg) + +### Gt-Prediction-Extension + +Extension of the prediction files. Used for evaluation. Default: `.pred.txt`. + +### Gt-Prediction-Media-Type + +Media type of the prediction files. Default: `text/plain`. + +### Gt-Prediction-Directory + +Name of the subfolder containing predictions if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `pred`. + +### Gt-Grayscale-Image-Extension + +Extension of the grayscale image files. Default: `.nrm.png`. + +### Gt-Grayscale-Image-Media-Type + +Media type of the grayscale image files. Default: `image/png`. + +### Gt-Grayscale-Image-Directory + +Name of the subfolder containing grayscale images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `grayscale`. + +### Gt-Color-Image-Extension + +Extension of the color image files.
Default: `.color.png`. + +### Gt-Color-Image-Media-Type + +Media type of the color image files. Default: `image/png`. + +### Gt-Color-Image-Directory + +Name of the subfolder containing color images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `img`. + +### Gt-Bitonal-Image-Extension + +Extension of the bitonal image files. Default: `.bin.png`. + +### Gt-Bitonal-Image-Media-Type + +Media type of the bitonal image files. Default: `image/png`. + +### Gt-Bitonal-Image-Directory + +Name of the subfolder containing bitonal images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `bin`. + +### Gt-Line-Metadata-Extension + +Extension of the [line metadata] files. Default: `.json`. + +### Gt-Line-Metadata-Media-Type + +Media type of the [line metadata] files. Default: `application/json`. + +### Gt-Line-Metadata-Directory + +Name of the subfolder containing [line metadata] if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `meta`. + +### Gt-Directory + +Directory below `/data` containing the ground truth. Default: `ground-truth`. + +### Gt-Directory-Structure + +Directory structure. One of: + + - `flat`: img and transcription in the [`Gt-Directory`] + - `flat-nested`: img and transcription in the same dir below [`Gt-Directory`] + - `subfolders`: img and transcription in subfolders [`Gt-Bitonal-Image-Directory`] and [`Gt-Transcription-Directory`] of [`Gt-Directory`] + - `subfolders-nested`: img and transcription in subfolders [`Gt-Bitonal-Image-Directory`] and [`Gt-Transcription-Directory`] in the same dir below [`Gt-Directory`] + +## Line metadata + +In addition to the bag-wide metadata defined by the [BagIt profile], metadata +can be saved per line to preserve the provenance of every single line. + +Line metadata can be encoded in JSON or YAML (depending on +[`Gt-Line-Metadata-Extension`] and [`Gt-Line-Metadata-Media-Type`]).
+ +Line metadata MUST adhere to this JSON schema: + + +```yaml +description: Schema for provenance of single lines +type: object +required: + - imageUrl +properties: + coords: + description: Coordinates as array of x-y-pairs + type: array + items: + type: array + length: 2 + items: + type: number + pageUrl: + description: URL of the page (resp. URL of the PAGE-XML file) + type: string + imageUrl: + description: URL of the image (resp. the `pg:imageFilename` in the PAGE-XML file) + type: string + bagUrl: + description: URL of the bag that contains the page + type: string + metsUrl: + description: URL of the METS document that contains the page + type: string + lineId: + description: ID of the line within the PAGE-XML doc + type: string + teiUrl: + description: URL of the TEI document that contains the page + type: string + xpath: + description: XPath to the line if no `lineId` was provided + type: string +``` + + + + +[`Gt-Directory`]: #gt-directory +[`Gt-Bitonal-Image-Directory`]: #gt-bitonal-image-directory +[`Gt-Transcription-Directory`]: #gt-transcription-directory +[`Gt-Directory-Structure`]: #gt-directory-structure +[`Gt-Line-Metadata-Directory`]: #gt-line-metadata-directory +[`Gt-Line-Metadata-Extension`]: #gt-line-metadata-extension +[`Gt-Line-Metadata-Media-Type`]: #gt-line-metadata-media-type +[BagIt Profile]: #bagit-profile +[line metadata]: #line-metadata diff --git a/model-evaluation-schema.json b/model-evaluation-schema.json new file mode 100644 index 0000000..c2039c4 --- /dev/null +++ b/model-evaluation-schema.json @@ -0,0 +1 @@ +{"$id":"https://ocr-d.github.io/schemas/v1/model-evaluation-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"recognizerArguments":{"description":"Command line arguments passed to the CLI recognition
tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to use","type":"string"},"measures":{"description":"which evaluation measures to produce","type":"array","items":{"type":"string","enum":["cer-per-line","cer-total","ler","wer-per-line","wer-total","confusion-matrix"]}}}} \ No newline at end of file diff --git a/model-evaluation-schema.yml b/model-evaluation-schema.yml new file mode 100644 index 0000000..e2a1821 --- /dev/null +++ b/model-evaluation-schema.yml @@ -0,0 +1,39 @@ +$id: https://ocr-d.github.io/schemas/v1/model-evaluation-schema.json +type: object +required: + - engineName + - engineVersion + - groundTruthBag + - model +properties: + engineName: + type: string + enum: + - ocropus + - kraken + - tesseract + - calamari + engineVersion: + type: string + recognizerArguments: + description: Command line arguments passed to the CLI recognition tool + type: array + default: [] + groundTruthBag: + description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json + type: string + model: + description: URL/path to model to use + type: string + measures: + description: which evaluation measures to produce + type: array + items: + type: string + enum: + - cer-per-line + - cer-total + - ler + - wer-per-line + - wer-total + - confusion-matrix diff --git a/single-line.json b/single-line.json new file mode 100644 index 0000000..503683a --- /dev/null +++ b/single-line.json @@ -0,0 +1 @@ +{"description":"Schema for provenance of single lines","type":"object","required":["imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL of the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp.
the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"teiUrl":{"description":"URL of the TEI document that contains the page","type":"string"},"xpath":{"description":"XPath to the line if no `lineId` was provided","type":"string"}}} \ No newline at end of file diff --git a/single-line.yml b/single-line.yml new file mode 100644 index 0000000..f5a8712 --- /dev/null +++ b/single-line.yml @@ -0,0 +1,35 @@ +description: Schema for provenance of single lines +type: object +required: + - imageUrl +properties: + coords: + description: Coordinates as array of x-y-pairs + type: array + items: + type: array + length: 2 + items: + type: number + pageUrl: + description: URL of the page (resp. URL of the PAGE-XML file) + type: string + imageUrl: + description: URL of the image (resp.
the `pg:imageFilename` in the PAGE-XML file) + type: string + bagUrl: + description: URL of the bag that contains the page + type: string + metsUrl: + description: URL of the METS document that contains the page + type: string + lineId: + description: ID of the line within the PAGE-XML doc + type: string + teiUrl: + description: URL of the TEI document that contains the page + type: string + xpath: + description: XPath to the line if no `lineId` was provided + type: string + diff --git a/training-schema.json b/training-schema.json new file mode 100644 index 0000000..a93e497 --- /dev/null +++ b/training-schema.json @@ -0,0 +1 @@ +{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"trainerArgs":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"recognizerArgs":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"trainingGlob":{"description":"Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"evaluationGlob":{"description":"Wildcard for matching only a subset of the ground truth files for evaluation. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model.
Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"validationRatio":{"description":"Ratio of training vs. validation data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into training vs. validation data","type":"integer","default":0}}} \ No newline at end of file diff --git a/training-schema.yml b/training-schema.yml new file mode 100644 index 0000000..490e5ca --- /dev/null +++ b/training-schema.yml @@ -0,0 +1,61 @@ +$id: https://ocr-d.github.io/schemas/v1/training-schema.json +type: object +required: + - engineName + - engineVersion + - groundTruthBag + - outputModelFormat +properties: + engineName: + type: string + enum: + - ocropus + - kraken + - tesseract + - calamari + engineVersion: + type: string + trainerArgs: + description: Command line arguments passed to the CLI training tool + type: array + default: [] + recognizerArgs: + description: Command line arguments passed to the CLI recognition tool + type: array + default: [] + groundTruthBag: + description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json + type: string + trainingGlob: + description: Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'. + type: 'string' + default: '*' + # type: array + # default: ['*'] + # items: + # type: string + evaluationGlob: + description: Wildcard for matching only a subset of the ground truth files for evaluation. Make sure to exclude extensions and end in '*'. + type: 'string' + default: '*' + # type: array + # default: ['*'] + # items: + # type: string + outputModelFormat: + description: The output format of the model.
Note that individual engines only support a single one or a subset of formats. + enum: + - application/vnd.ocrd.pronn # kraken < 1.0 + - application/vnd.ocrd.clstm # ocropy-lpred, clstm, kraken<1.0 + - application/vnd.ocrd.coreml # kraken >= 1.0 + - application/vnd.ocrd.pyrnn # ocropy-rpred + - application/vnd.ocrd.tf+zip # calamari, zipped tensorflow data + - application/vnd.ocrd.tesseract4 # tesseract >= 4.0.beta1 + validationRatio: + description: Ratio of training vs. validation data to divide up ground truth + type: number + default: 0.9 + randomSeed: + description: Seed for the random number generator shuffling the ground truth before dividing it into training vs. validation data + type: integer + default: 0