From d8cf63e6951df5a2034dec3ecee720f50ba03d09 Mon Sep 17 00:00:00 2001
From: Marianna
Date: Thu, 23 Feb 2023 11:58:09 +0300
Subject: [PATCH] initial commit

---
 datasets/chemrxiv/README.md           |   8 +++
 datasets/chemrxiv/chemrxiv_scraper.py |  67 +++++++++++++++++
 datasets/chemrxiv/dl_chemrxiv.py      |  32 +++++++++
 datasets/chemrxiv/parse_ocr.py        |  27 +++++++
 datasets/chemrxiv/parse_pdfs.py       | 100 ++++++++++++++++++++++++++
 datasets/chemrxiv/requirements.txt    |  10 +++
 6 files changed, 244 insertions(+)
 create mode 100644 datasets/chemrxiv/README.md
 create mode 100644 datasets/chemrxiv/chemrxiv_scraper.py
 create mode 100644 datasets/chemrxiv/dl_chemrxiv.py
 create mode 100644 datasets/chemrxiv/parse_ocr.py
 create mode 100644 datasets/chemrxiv/parse_pdfs.py
 create mode 100644 datasets/chemrxiv/requirements.txt

diff --git a/datasets/chemrxiv/README.md b/datasets/chemrxiv/README.md
new file mode 100644
index 0000000..be96bd1
--- /dev/null
+++ b/datasets/chemrxiv/README.md
@@ -0,0 +1,8 @@
+# ChemRxiv dataset
+
+### Download from 🤗 [HF Hub](https://huggingface.co/datasets/marianna13/chemrxiv)
+
+### Dataset building & processing steps:
+- Scrape metadata & PDF URLs from the [website](https://chemrxiv.org/engage/chemrxiv/public-dashboard)
+- Download the PDFs
+- Parse the PDFs using PyMuPDF, LayoutParser & Tesseract
\ No newline at end of file
diff --git a/datasets/chemrxiv/chemrxiv_scraper.py b/datasets/chemrxiv/chemrxiv_scraper.py
new file mode 100644
index 0000000..a9eb038
--- /dev/null
+++ b/datasets/chemrxiv/chemrxiv_scraper.py
@@ -0,0 +1,67 @@
+import requests
+import json
+import pandas as pd
+from multiprocessing.pool import ThreadPool
+import os
+
+
+def parse_item(item):
+    keys = ['id', 'title', 'abstract', 'keywords', 'origin',
+            'submittedDate', 'mainCategory', 'asset', 'authors', 'citationsCount']
+    parsed_item = {k: item[k] for k in keys}
+    parsed_item['mainCategory'] = parsed_item['mainCategory']['name']
+    parsed_item['asset'] = parsed_item['asset']['original']['url']
+    parsed_item['authors'] = [a['firstName'] + ' ' + a['lastName']
+                              for a in parsed_item['authors']]
+    return item['id'], parsed_item
+
+
+def get_data(skip, output_dir):
+    # Request headers captured from a browser session; the cookie and
+    # x-api-key are session-specific and will likely need to be refreshed.
+    headers = {
+        'accept': 'application/json, text/plain, */*',
+        'content-type': 'application/json',
+        'accept-encoding': 'gzip, deflate, br',
+        'accept-language': 'en-US,en;q=0.9,',
+        'cookie': 'orp_chemrxiv_sess=s%3Ac3TzBf5JLpLxI_BXEZTSSqS4zbcVmzBS.E0pCSjlLpqSHlNLRZ4NxV8croXlN%2BS8%2BTkriBClGpKA; _ga=GA1.2.204378878.1676638482; _gid=GA1.2.1606288349.1676638482; ln_or=eyIxMzQ4MzQ4IjoiZCJ9; _hjFirstSeen=1; _hjIncludedInSessionSample_2726500=0; _hjSession_2726500=eyJpZCI6IjhhZTgzZDc2LWI0MWMtNDE1NS05YmJiLWNmZTNjZGQxNjkzNSIsImNyZWF0ZWQiOjE2NzY2Mzg0ODE5NzQsImluU2FtcGxlIjpmYWxzZX0=; _hjIncludedInPageviewSample=1; _hjAbsoluteSessionInProgress=0; ELOQUA=GUID=F9383A9B1E5F49259F6D63A4C2634226; site24x7rumID=9870242232781788.1676638516955.1676638516959; _hjSessionUser_2726500=eyJpZCI6ImM3Yzc4MDZmLTRmNDYtNTUxNS04NWY0LWNkZjM1NjExZmI4MSIsImNyZWF0ZWQiOjE2NzY2Mzg0ODE5NDksImV4aXN0aW5nIjp0cnVlfQ==',
+        'origin': 'https://chemrxiv.org',
+        'referer': 'https://chemrxiv.org/engage/chemrxiv/search-dashboard',
+        'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'sec-fetch-dest': 'empty',
+        'sec-fetch-mode': 'cors',
+        'sec-fetch-site': 'same-origin',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
+        'x-api-key': 'y6nWHrymZysXc'
+    }
+
+    req_json = {
+        'query': "query searchDashboardPageLoad(\n $text: String = \"\",\n $subjects: [String!],\n $categories: [String!],\n $events: [String!],\n $publishedDates: [String!],\n $partners: [String!],\n $contents: [String!],\n $keywords: [String!],\n $authors: String = \"\",\n $skip: Int = 0,\n $limit: Int = 10,\n $sortBy: SortByEnum = RELEVANT_DESC\n ) {\n viewer {\n usageEventsDisabled\n\n user {\n ...userRoleFragment\n }\n\n searchItems(\n searchTerm: $text,\n subjectKeys: $subjects,\n categoryKeys: $categories,\n eventKeys: $events,\n publishedDateKeys: $publishedDates,\n partnerKeys: $partners,\n contentTypeKeys: $contents,\n keywordsKeys: $keywords,\n searchAuthor: $authors,\n skip: $skip,\n limit: $limit,\n sortBy: $sortBy,\n includeBuckets: true\n ) {\n totalCount\n\n results: itemHits {\n highlight {\n text\n matchPositions {\n start\n end\n }\n }\n\n item {\n ...itemMatchFragment\n }\n }\n\n subjectBuckets {\n ...searchBucketFragment\n }\n\n categoryBuckets {\n ...searchBucketFragment\n }\n\n eventBuckets {\n ...searchBucketFragment\n }\n\n partnerBuckets {\n ...searchBucketFragment\n }\n\n publishedDateBuckets {\n ...searchBucketFragment\n }\n\n contentBuckets: contentTypeBuckets {\n ...searchBucketFragment\n }\n\n dateBuckets: publishedDateBuckets {\n ...searchBucketFragment\n }\n }\n\n subjectTypes: subjects {\n ...subjectTypeFragment\n }\n\n contentTypes {\n ...contentTypeFragment\n }\n\n categoryTypes: categories {\n ...categoryTypeFragment\n }\n }\n}\n\nfragment userRoleFragment on User {\n __typename\n id\n sessionExpiresAt\n titleTypeId: title\n firstName\n lastName\n emailAddress : email\n orcid\n roles\n accountType\n}\n\nfragment itemMatchFragment on MainItem {\n __typename\n id\n title\n abstract\n keywords\n origin\n submittedDate\n subjectType: subject {\n ...subjectTypeFragment\n }\n contentType {\n ...contentTypeFragment\n }\n categoryTypes: categories {\n ...categoryTypeFragment\n }\n mainCategory {\n name\n }\n asset{\n mimeType\n original{\n url\n }\n }\n authors {\n title\n firstName\n lastName\n authorConfirmationId\n displayOrder\n }\n metrics {\n metricType\n description\n value\n unit\n }\n citationsCount\n community {\n id\n name\n }\n}\n\nfragment searchBucketFragment on SearchBucket {\n __typename\n count\n key\n label\n}\n\nfragment subjectTypeFragment on Subject {\n __typename\n id\n name\n description\n}\n\nfragment contentTypeFragment on ContentType {\n __typename\n id\n name\n allowSubmission\n allowJournalSubmission\n allowCommunitySubmission\n allowResearchDirectionSubmission\n videoAllowedCheck\n allowedFileTypes\n allowedVideoFileTypes\n}\n\nfragment categoryTypeFragment on Category {\n __typename\n id\n name\n description\n parentId\n}\n",
+        'variables': {
+            'categories': [],
+            'contents': [],
+            'events': [],
+            'keywords': [],
+            'partners': [],
+            'publishedDates': [],
+            'skip': skip,
+            'subjects': []
+        }
+    }
+
+    url = 'https://chemrxiv.org/engage/api-gateway/chemrxiv/graphql'
+    res = requests.post(url, headers=headers, json=req_json)
+    data = res.json()['data']['viewer']['searchItems']['results']
+    items = dict(parse_item(d['item']) for d in data)
+    df = pd.read_json(json.dumps(items), orient='index')
+    df.to_parquet(f'{output_dir}/{skip}.parquet', index=False)
+
+
+if __name__ == '__main__':
+
+    output_dir = 'CHEMRXIV'
+    os.makedirs(output_dir, exist_ok=True)
+    skips = list(range(0, 16531, 10))
+    num_procs = 5
+    with ThreadPool(num_procs) as p:
+        p.starmap(get_data, [(skip, output_dir) for skip in skips])
diff --git a/datasets/chemrxiv/dl_chemrxiv.py b/datasets/chemrxiv/dl_chemrxiv.py
new file mode 100644
index 0000000..8e3b5c9
--- /dev/null
+++ b/datasets/chemrxiv/dl_chemrxiv.py
@@ -0,0 +1,32 @@
+import os
+import pandas as pd
+from multiprocessing.pool import ThreadPool
+import requests
+
+
+def download(URL, out_dir):
+    headers = {
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
+    }
+    doc = requests.get(URL, headers=headers)
+    file_name = URL.split('/')[-1]
+    file_name = f'{out_dir}/{file_name}'
+    try:
+        with open(file_name, 'wb') as f:
+            f.write(doc.content)
+    except OSError:
+        print(URL)
+
+
+if __name__ == '__main__':
+    output_dir = 'CHEMRXIV_pdfs'
+    os.makedirs(output_dir, exist_ok=True)
+    df = pd.read_parquet('chemrxiv.parquet')['asset']
+
+    # skip PDFs that have already been downloaded
+    done_urls = set(os.listdir(output_dir))
+
+    urls = [url for url in df.values if url.split('/')[-1] not in done_urls]
+
+    num_proc = 10
+    with ThreadPool(num_proc) as p:
+        p.starmap(download, [(url, output_dir) for url in urls])
diff --git a/datasets/chemrxiv/parse_ocr.py b/datasets/chemrxiv/parse_ocr.py
new file mode 100644
index 0000000..cceae65
--- /dev/null
+++ b/datasets/chemrxiv/parse_ocr.py
@@ -0,0 +1,27 @@
+import cv2
+import layoutparser as lp
+import pytesseract
+import numpy as np
+
+
+# PubLayNet-trained Detectron2 layout model; blocks scoring below 0.8 are discarded
+model = lp.models.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
+                                        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],)
+
+
+def detect_ocr(pix):
+    langs = 'eng'
+    # decode the PyMuPDF pixmap into an image, upscale it and convert BGR -> RGB
+    pix = cv2.imdecode(np.frombuffer(pix.tobytes('jpg'), np.uint8), -1)
+    pix = cv2.resize(pix, None, fx=1.5, fy=1.5)
+    pix = pix[..., ::-1]
+    layout = model.detect(pix)
+    text_blocks = lp.Layout([b for b in layout if b.type != 'Figure'])
+    text_list = []
+    for block in text_blocks:
+        segment_image = (block
+                         .pad(left=5, right=5, top=5, bottom=5)
+                         .crop_image(pix))
+        text = pytesseract.image_to_string(
+            segment_image, lang=langs, config='--oem 1')
+        text_list.append(text)
+    return '\n'.join(text_list)
diff --git a/datasets/chemrxiv/parse_pdfs.py b/datasets/chemrxiv/parse_pdfs.py
new file mode 100644
index 0000000..db6bce0
--- /dev/null
+++ b/datasets/chemrxiv/parse_pdfs.py
@@ -0,0 +1,100 @@
+import os
+import pandas as pd
+from tqdm import tqdm
+import fitz
+import multiprocessing as mp
+from parse_ocr import detect_ocr
+import glob
+
+
+def get_text(doc):
+    text_list = []
+
+    doc = fitz.open(stream=doc)
+    for i in range(doc.page_count):
+        page = doc.load_page(i)
+        text = page.get_text()
+        if text != '':
+            text_list.append(text)
+        else:
+            # no embedded text layer on this page: fall back to layout detection + OCR
+            text = detect_ocr(page.get_pixmap())
+            text_list.append(text)
+    if len(text_list) > 0:
+        return '\n'.join(text_list)
+
+
+def process_part(files, output_dir, st):
+    # variant that reads PDFs from S3 objects (boto3); __main__ uses process_part_files instead
+    i = 0
+    data = {
+        'TEXT': [],
+        'SOURCE': []
+    }
+    for obj in tqdm(files):
+        key = obj.key
+        try:
+            body = obj.get()['Body'].read()
+            text = get_text(body)
+            if text is not None:
+                data['TEXT'].append(text)
+                data['SOURCE'].append(key)
+                i += 1
+            if i > 0 and i % 10 == 0:
+                pd.DataFrame(data).to_parquet(
+                    f'{output_dir}/{st}_{i}.parquet', index=False)
+                data = {
+                    'TEXT': [],
+                    'SOURCE': []
+                }
+        except Exception:
+            # print(key)
+            continue
+
+
+def process_part_files(files, output_dir, st):
+    i = 0
+    data = {
+        'TEXT': [],
+        'SOURCE': []
+    }
+    for f in tqdm(files):
+        key = f
+        try:
+            with open(f, 'rb') as f_handle:
+                body = f_handle.read()
+            text = get_text(body)
+            if text is not None:
+                data['TEXT'].append(text)
+                data['SOURCE'].append(key)
+                i += 1
+            if i > 0 and i % 10 == 0:
+                pd.DataFrame(data).to_parquet(
+                    f'{output_dir}/{st}_{i}.parquet', index=False)
+                data = {
+                    'TEXT': [],
+                    'SOURCE': []
+                }
+        except Exception as err:
+            print(err)
+            os.remove(f)  # drop PDFs that could not be parsed
+            continue
+    # write out whatever is left after the last full batch of 10
+    if data['TEXT']:
+        pd.DataFrame(data).to_parquet(
+            f'{output_dir}/{st}_last.parquet', index=False)
+
+
+if __name__ == '__main__':
+    output_dir = 'chemrxiv_text'
+    os.makedirs(output_dir, exist_ok=True)
+    files = glob.glob('CHEMRXIV_pdfs/*.pdf')
+    N = len(files)
+    print(N)
+    processes = []
+    num_process = 10
+    rngs = [(i * (N // num_process), (i + 1) * (N // num_process))
+            for i in range(num_process)]
+    rngs[-1] = (rngs[-1][0], N)  # make sure the last chunk covers the remainder
+    print(rngs)
+    for rng in rngs:
+        start, end = rng
+        p = mp.Process(target=process_part_files, args=[
+            files[start:end], output_dir, start])
+        p.start()
+        processes.append(p)
+    for p in processes:
+        p.join()
diff --git a/datasets/chemrxiv/requirements.txt b/datasets/chemrxiv/requirements.txt
new file mode 100644
index 0000000..edacc4e
--- /dev/null
+++ b/datasets/chemrxiv/requirements.txt
@@ -0,0 +1,10 @@
+requests
+pandas
+layoutparser
+opencv-python
+tqdm
+pytesseract
+pymupdf
+torch
+torchvision
+git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2
\ No newline at end of file
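
Note on wiring the steps together: chemrxiv_scraper.py writes one parquet file per result page into CHEMRXIV/, while dl_chemrxiv.py expects a single chemrxiv.parquet with an 'asset' column; the merge step is not included in this patch. A minimal sketch of that step, assuming the per-page files live under CHEMRXIV/ (the paths and output filename here are illustrative):

import glob

import pandas as pd

# Concatenate the per-page metadata shards written by chemrxiv_scraper.py into
# the single parquet file that dl_chemrxiv.py reads (paths are assumptions).
parts = [pd.read_parquet(p) for p in sorted(glob.glob('CHEMRXIV/*.parquet'))]
pd.concat(parts, ignore_index=True).to_parquet('chemrxiv.parquet', index=False)

With that in place, the intended order matches the README: chemrxiv_scraper.py, the merge above, dl_chemrxiv.py, then parse_pdfs.py.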