Skip to content

Commit

Permalink
Update generate_rdf.py
Browse files Browse the repository at this point in the history
Replaced the hand-rolled progress printing with a tqdm progress bar, as suggested in david4096#24.
  • Loading branch information
Edwin-Ong-Jun-Kiat authored Nov 7, 2024
1 parent 0b320c6 commit 514210d
Showing 1 changed file with 18 additions and 20 deletions.
38 changes: 18 additions & 20 deletions huggingface_rdf/generate_rdf.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
from rdflib import Graph, Namespace, URIRef, Literal
import json
import logging
from tqdm import tqdm

# Configure the root logger at import time: INFO level and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def show_progress(current, total):
    """Print a one-line progress report to standard output.

    Args:
        current: Number of items completed so far.
        total: Total number of items expected.

    An empty job (``total`` of zero) is reported as 100% complete instead of
    raising ``ZeroDivisionError``.
    """
    # Guard the division: with nothing to process there is nothing left to do.
    progress = (current / total) * 100 if total else 100.0
    print(f"Progress: {current}/{total} ({progress:.2f}%)")

def convert_to_rdf(data, output_file,base="http://fakebase"):
def chunk_data(data, chunk_size):
    """Yield consecutive slices of ``data`` holding at most ``chunk_size`` items.

    Args:
        data: A sliceable sequence that supports ``len()``.
        chunk_size: Maximum number of items per chunk; must be at least 1.

    Yields:
        Slices of ``data`` in their original order; the last may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Because this is a
            generator, the error surfaces on first iteration, not at call time.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it with the same exception type range() raises for a zero step.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]
def convert_to_rdf(data, output_file, base="http://fakebase", chunk_size=10):
    """Parse JSON-LD data into a single RDF graph and write it out as Turtle.

    Each item is dumped to a JSON string and parsed as JSON-LD into one shared
    graph, with a tqdm progress bar advancing once per parsed item.

    Args:
        data: A JSON-serializable JSON-LD document, or a list/tuple of them.
            Any other value is treated as a single document.
        output_file: Destination passed to ``Graph.serialize``.
        base: Base IRI used when parsing every item.
        chunk_size: Number of items taken from ``data`` per chunk; forwarded
            to ``chunk_data``.

    Returns:
        Whatever ``Graph.serialize(destination=..., format='ttl')`` returns.
        NOTE(review): with a destination this is presumably the graph itself
        (or ``None`` on older rdflib), not a Turtle string - confirm against
        the installed rdflib version.
    """
    # A lone document (e.g. a dict) cannot be sliced and len() would count its
    # keys, so wrap it to get the same per-item handling as a list.
    items = data if isinstance(data, (list, tuple)) else [data]
    total_items = len(items)
    logging.info(f"Starting RDF conversion. Total items: {total_items}, Chunk size: {chunk_size}")

    g = Graph()
    base_uri = URIRef(base)  # loop-invariant; build it once

    with tqdm(total=total_items, desc="Parsing data") as pbar:
        for chunk in chunk_data(items, chunk_size):
            for item in chunk:
                g.parse(data=json.dumps(item), format='json-ld', base=base_uri)
                pbar.update(1)

    # Serialize first so success is only reported once the file was written.
    result = g.serialize(destination=output_file, format='ttl')
    logging.info(f"RDF data successfully saved to {output_file}")
    return result

0 comments on commit 514210d

Please sign in to comment.