Skip to content

Commit

Permalink
bump schema for poa v4 and vor v8 (#1)
Browse files Browse the repository at this point in the history
* manage.sh, adds 'clean' command to tidy up generated files.
* main.go, now uses POA v4 and VOR v8 from the api-raml.
* main.go, non-json files excluded from paths to validate.
* main.go, files to keep buffered in memory default reduced to 1k.
* readme.md, updated help and copyright sections.
  • Loading branch information
lsh-0 authored Mar 6, 2024
1 parent f9ebc06 commit e6521fb
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 31 deletions.
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,19 @@

## Usage

$ go run main.go -h
Usage of /tmp/go-build3363302255/b001/exe/main:
$ go run . -h
Usage of /tmp/go-build3486126079/b001/exe/validate-article-json:
-article-json string
the path to a article-json file or directory
path to an article-json file or directory
-buffer-size int
the maximum number of article-json files to keep in memory at once (default 2000)
maximum number of article-json files to keep in memory at once (default 1000)
-num-workers int
the number of workers to process the article-json files (default 12)
number of workers (goroutines) to process the article-json files
0 for number of cpu cores (default), -1 for unbounded
-sample-size int
the number of article-json files to parse (default -1)
number of article-json files to parse (default -1)
-schema-root string
the path to api-raml schema root
path to api-raml schema root

For example:

Expand All @@ -46,6 +47,6 @@ sys 0m0.181s

## Licence

Copyright © 2023 eLife Sciences
Copyright © 2024 eLife Sciences

Distributed under the GNU Affero General Public Licence, version 3.
52 changes: 29 additions & 23 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ type Result struct {
Error error
}

// String renders a Result as a one-line summary of the form
// "<type> <valid|invalid> in\t<elapsed>ms: <filename>", for example:
//
//	"VOR valid in\t   2ms: elife-09560-v1.xml.json"
//	"POA invalid in\t 123ms: elife-09560-v1.xml.json"
//
// Elapsed is formatted with %4d, so it is printed as an integer,
// right-aligned to a minimum width of four (presumably whole
// milliseconds, given the "ms" suffix - confirm against where Elapsed is set).
func (r Result) String() string {
	// only the validity word differs between the two outcomes.
	verdict := "invalid"
	if r.Success {
		verdict = "valid"
	}
	return fmt.Sprintf("%s %s in\t%4dms: %s", r.Type, verdict, r.Elapsed, r.FileName)
}

type Article struct {
Type string // POA or VOR
FileName string
Expand All @@ -57,8 +67,8 @@ func configure_validator(schema_root string) map[string]Schema {
compiler := jsonschema.NewCompiler()
compiler.Draft = jsonschema.Draft4
schema_file_list := map[string]string{
"POA": path.Join(schema_root, "/dist/model/article-poa.v3.json"),
"VOR": path.Join(schema_root, "/dist/model/article-vor.v7.json"),
"POA": path.Join(schema_root, "/dist/model/article-poa.v4.json"),
"VOR": path.Join(schema_root, "/dist/model/article-vor.v8.json"),
}

schema_map := map[string]Schema{}
Expand All @@ -67,7 +77,7 @@ func configure_validator(schema_root string) map[string]Schema {
panic_on_err(err, fmt.Sprintf("reading '%s' schema file: %s", label, path))
if label == "VOR" {
// patch ISBN regex as it can't be compiled in Go.
// todo: this needs a fix upstream.
// todo: this needs a fix upstream in api-raml.
// - https://json-schema.org/understanding-json-schema/reference/regular_expressions.html
// - https://github.com/santhosh-tekuri/jsonschema/issues/113
// - https://github.com/elifesciences/api-raml/blob/8e2ffb573b2c3d2e173c38cd8b9625cf2d5740ad/src/misc/isbn.v1.yaml#L6
Expand Down Expand Up @@ -96,15 +106,15 @@ func read_article_data(article_json_path string) Article {
article_json_bytes, err := os.ReadFile(article_json_path)
panic_on_err(err, "reading bytes from path: "+article_json_path)

result := gjson.GetBytes(article_json_bytes, "article.status")
if !result.Exists() {
article_status := gjson.GetBytes(article_json_bytes, "article.status") // "poa", "vor"
if !article_status.Exists() {
panic("'article.status' field in article data not found: " + article_json_path)
}
schema_key := strings.ToUpper(result.String()) // "poa" => "POA"
schema_key := strings.ToUpper(article_status.String()) // "poa" => "POA"

// article-json contains 'journal', 'snippet' and 'article' sections.
// extract just the 'article' from the article data.
result = gjson.GetBytes(article_json_bytes, "article")
result := gjson.GetBytes(article_json_bytes, "article")
if !result.Exists() {
panic("'article' field in article data not found: " + article_json_path)
}
Expand Down Expand Up @@ -175,16 +185,6 @@ func validate_article(schema_map map[string]Schema, article Article, capture_err
return r
}

func (r Result) String() string {
	// output looks like (elapsed is an integer padded to four columns):
	// "VOR valid in\t   2ms: elife-09560-v1.xml.json"
	// "POA invalid in\t 123ms: elife-09560-v1.xml.json"
	const layout = "%s %s in\t%4dms: %s"
	if !r.Success {
		return fmt.Sprintf(layout, r.Type, "invalid", r.Elapsed, r.FileName)
	}
	return fmt.Sprintf(layout, r.Type, "valid", r.Elapsed, r.FileName)
}

func format_ms(ms int64) string {
elapsed_str := fmt.Sprintf("%dms", ms)
if ms > 1000 {
Expand Down Expand Up @@ -257,11 +257,11 @@ func process_files_with_feeder(buffer_size int, num_workers int, file_list []str

func do() {
schema_root_ptr := flag.String("schema-root", "", "path to api-raml schema root")
input_path_ptr := flag.String("article-json", "", "path to a article-json file or directory")
input_path_ptr := flag.String("article-json", "", "path to an article-json file or directory")
sample_size_ptr := flag.Int("sample-size", -1, "number of article-json files to parse")
num_workers_ptr := flag.Int("num-workers", 0, "number of workers (goroutines) to process the article-json files\n0 for number of cpu cores, -1 for unbounded")
num_workers_ptr := flag.Int("num-workers", 0, "number of workers (goroutines) to process the article-json files\n0 for number of cpu cores (default), -1 for unbounded")
// 1k articles use roughly 1.5GiB of RAM
buffer_size_ptr := flag.Int("buffer-size", 2000, "the maximum number of article-json files to keep in memory at once")
buffer_size_ptr := flag.Int("buffer-size", 1000, "maximum number of article-json files to keep in memory at once")
flag.Parse()

schema_root := *schema_root_ptr
Expand Down Expand Up @@ -304,20 +304,26 @@ func do() {
sample_size = len(path_list)
}

// sort files smallest to highest (asc).
// sort files by filename, lexically (plain string comparison), lowest to highest (asc).
// order of file listings is never guaranteed so sort before we take a sample.
// note! filename output happens in parallel so it may appear unordered.
// note! filename output happens in parallel so progress may *appear* unordered.
sort.Slice(path_list, func(a, b int) bool {
return path_list[a].Name() < path_list[b].Name()
})

// filter any directories
file_list := []string{}
for i := 0; i < sample_size; i++ {
path := path_list[i]
// remove any directories
if path.IsDir() {
continue
}

// remove any non-json files
if filepath.Ext(path.Name()) != ".json" {
continue
}

file_list = append(file_list, filepath.Join(input_path, path.Name()))
}

Expand Down
6 changes: 6 additions & 0 deletions manage.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ if test "$cmd" = "build"; then
-v
exit 0

elif test "$cmd" = "clean"; then
# validate-article-json - generated by Go because of go.mod
# linux-amd64* linux-arm64* - generated by the 'release' command.
rm -fv validate-article-json linux-amd64* linux-arm64*
exit 0

elif test "$cmd" = "release"; then
# GOOS is 'Go OS' and is being explicit in which OS to build for.
# CGO_ENABLED=0 skips CGO and linking against glibc to build static binaries.
Expand Down

0 comments on commit e6521fb

Please sign in to comment.