import spacy
from transformers import pipeline
# Optional: install the SpaCy model at runtime if it is not already available locally.
# import subprocess
# try:
#     subprocess.run(
#         [
#             "python",
#             "-m",
#             "pip",
#             "install",
#             "--user",
#             "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz",
#         ],
#         check=True,
#     )
#     print("Model installed successfully with --user!")
# except subprocess.CalledProcessError as e:
#     print(f"Error during installation: {e}")
# spacy.cli.download("en_core_web_sm")
# Load the SpaCy English model for NER from a locally extracted copy of the
# en_core_web_sm-3.5.0 release (see the commented install block above)
model_path = "en_core_web_sm-3.5.0/en_core_web_sm/en_core_web_sm-3.5.0/"
nlp_spacy = spacy.load(model_path)
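# A minimal sketch of what the NER step yields: a list of entity spans, each with a
# label such as PERSON, GPE, or DATE. The labels shown are illustrative.
#   doc = nlp_spacy("People in Wuhan were at higher risk in 2020.")
#   print([(ent.text, ent.label_) for ent in doc.ents])
#   # e.g. [('Wuhan', 'GPE'), ('2020', 'DATE')]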
# Load the question generation model
nlp_qg = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
# The model valhalla/t5-base-qg-hl is a pre-trained model hosted on the Hugging Face
# Model Hub. It is a variant of the T5 (Text-To-Text Transfer Transformer) model
# fine-tuned for Question Generation (QG) tasks.
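# A minimal sketch of the input convention this model was trained on: the answer span
# is wrapped in <hl> tokens, and the model generates a question whose answer is that
# span. The question shown is illustrative; actual output may vary by model version.
#   qg_input = "<hl>COVID-19<hl> is a contagious disease caused by the virus SARS-CoV-2."
#   print(nlp_qg(qg_input, max_new_tokens=50)[0]['generated_text'])
#   # e.g. "What disease is caused by the virus SARS-CoV-2?"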
# Load the question answering model
nlp_qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
# The distilbert-base-cased-distilled-squad model is a pre-trained model available on
# the Hugging Face Model Hub. It is a distilled version of BERT (Bidirectional Encoder
# Representations from Transformers), fine-tuned for the SQuAD (Stanford Question
# Answering Dataset) task.
# DistilBERT is a smaller, faster, cheaper, and lighter version of BERT. It is created
# using knowledge distillation, where a smaller model (the student) is trained to
# replicate the behavior of a larger model (the teacher), in this case BERT. DistilBERT
# retains 97% of BERT's language understanding capabilities while being 60% faster and
# 40% smaller.
# This specific variant has been fine-tuned on SQuAD, a large-scale dataset for
# question answering tasks.
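# A minimal sketch of how the QA pipeline is called: given a question and a context
# string, it returns a dict with 'answer', 'score', 'start', and 'end' keys.
#   qa = nlp_qa(question="What virus causes COVID-19?",
#               context="COVID-19 is a contagious disease caused by the virus SARS-CoV-2.")
#   print(qa['answer'])  # e.g. "SARS-CoV-2"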
def highlight_text(text, entity):
    """Wrap the first occurrence of `entity` in the <hl> answer-span markers
    expected by the question generation model."""
    # Replace only the first occurrence; highlighting every occurrence would insert
    # multiple <hl> spans, which the QG model does not expect.
    return text.replace(entity, f"<hl>{entity}<hl>", 1)
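# Illustrative usage (only the first occurrence is highlighted):
#   highlight_text("Paris is beautiful. Paris is old.", "Paris")
#   # -> "<hl>Paris<hl> is beautiful. Paris is old."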
# Example input text
sample_text = ("The symptoms of COVID19 are variable but often include fever, fatigue, cough, "
"breathing difficulties, loss of smell, and loss of taste. Symptoms may begin one "
"to fourteen days after exposure to the virus. At least a third of people who are "
"infected do not develop noticeable symptoms. Of those who develop symptoms "
"noticeable enough to be classified as patients, most (81%) develop mild to moderate "
"symptoms (up to mild pneumonia), while 14 develop severe symptoms (dyspnea, hypoxia, "
"or more than 50 lung involvement on imaging), and 5 develop critical symptoms "
"(respiratory failure, shock, or multiorgan dysfunction). Older people are at a higher "
"risk of developing severe symptoms. Some complications result in death. Some people "
"continue to experience a range of effects (long COVID) for months or years after "
"infection, and damage to organs has been observed. Multi-year studies are underway "
"to further investigate the long-term effects of the disease.")
def helper_tool(text, no_of_ques):
    """Generate up to `no_of_ques` question/answer pairs from `text` by
    highlighting named entities and running the QG and QA pipelines."""
    # Use SpaCy to identify named entities in the text, de-duplicated in order
    # of first appearance
    doc = nlp_spacy(text)
    entities = []
    for ent in doc.ents:
        if ent.text not in entities:
            entities.append(ent.text)
    print(entities)

    # Generate questions by highlighting different entities
    questions_and_answers = []
    for entity in entities:
        highlighted_text = highlight_text(text, entity)
        result = nlp_qg(highlighted_text, max_new_tokens=50)
        question = result[0]['generated_text']
        # Get the answer for the generated question
        answer = nlp_qa(question=question, context=text)['answer']
        questions_and_answers.append((question, answer))
        if len(questions_and_answers) >= no_of_ques:
            break

    # If there are fewer entities than requested questions, pad by re-generating
    # from the first entity. Generation is greedy by default, so these padded
    # pairs will duplicate the first one. The `entities` guard avoids an
    # IndexError when no entities were found at all.
    if entities and len(questions_and_answers) < no_of_ques:
        for _ in range(no_of_ques - len(questions_and_answers)):
            highlighted_text = highlight_text(text, entities[0])
            result = nlp_qg(highlighted_text, max_new_tokens=50)
            question = result[0]['generated_text']
            answer = nlp_qa(question=question, context=text)['answer']
            questions_and_answers.append((question, answer))

    # Print the generated questions and answers
    for i, (question, answer) in enumerate(questions_and_answers[:no_of_ques]):
        print(f"Question {i+1}: {question}")
        print(f"Answer {i+1}: {answer}\n")
    return questions_and_answers
# helper_tool(sample_text, 2)
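# An illustrative run (the actual questions and answers depend on the model versions
# and on which entities SpaCy detects in the text):
#   qa_pairs = helper_tool(sample_text, 2)
#   # Question 1: How many days after exposure may symptoms begin?  (illustrative)
#   # Answer 1: one to fourteen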