import os
import re
import heapq
import numpy as np
import nltk
from collections import defaultdict
from typing import List, Dict, Tuple, Union
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import spacy
from nltk.stem import PorterStemmer


class TextSummarization:
    def __init__(self):
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading spaCy model...")
            os.system("python -m spacy download en_core_web_sm")
            self.nlp = spacy.load('en_core_web_sm')

        # Fetch the NLTK 'punkt' sentence tokenizer data if it is not already installed
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)

        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.stemmer = PorterStemmer()

    def clean_text(self, text: str) -> Tuple[str, str]:
        """Clean and preprocess the text, returning both original and stemmed versions."""
        text = re.sub(r'[^\w\s.,!?]', '', text)  # Keep sentence structure
        cleaned_text = ' '.join(text.split())  # Remove extra whitespace
        stemmed_text = self.stem_text(cleaned_text)
        return cleaned_text, stemmed_text

    def stem_text(self, text: str) -> str:
        """Stem the words in the text."""
        words = text.split()
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def score_sentences(self, original_sentences: List[str], stemmed_sentences: List[str]) -> Dict[str, float]:
        """Score sentences based on TF-IDF and structural features."""
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
        sentence_scores = defaultdict(float)

        for i, original_sentence in enumerate(original_sentences):
            # Base score: sum of the sentence's TF-IDF term weights
            score = tfidf_matrix[i].sum()
            sent_doc = self.nlp(original_sentence)

            # Length weighting: down-weight very short and very long sentences
            length_factor = len(sent_doc) / 20.0 if len(sent_doc) < 20 else 20.0 / len(sent_doc)
            score *= length_factor

            # Position bonuses for sentences near the start or end of the document
            if i < len(original_sentences) * 0.2:
                score *= 1.2
            elif i > len(original_sentences) * 0.8:
                score *= 1.1

            # Bonuses for named entities and important dependencies
            if sent_doc.ents:
                score *= 1.2
            if any(token.dep_ in ['nsubj', 'dobj'] for token in sent_doc):
                score *= 1.1

            sentence_scores[original_sentence] = score

        return sentence_scores

    def extract_key_points(self, original_sentences: List[str], stemmed_sentences: List[str], num_clusters: int = 5) -> List[str]:
        """Extract key points using K-means clustering."""
        num_clusters = min(num_clusters, len(original_sentences))
        if num_clusters < 1:
            return []

        tfidf_matrix = self.tfidf_vectorizer.fit_transform(stemmed_sentences)
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(tfidf_matrix)

        labeled_sentences = [
            (orig, stem, label, idx)
            for idx, (orig, stem, label) in enumerate(zip(original_sentences, stemmed_sentences, kmeans.labels_))
        ]
        key_points = []

        for cluster in range(num_clusters):
            cluster_sentences = [item for item in labeled_sentences if item[2] == cluster]
            if cluster_sentences:
                # Pick the sentence closest to the cluster centroid as its representative
                cluster_center = kmeans.cluster_centers_[cluster]
                distances = [np.linalg.norm(tfidf_matrix[item[3]].toarray() - cluster_center)
                             for item in cluster_sentences]
                closest = cluster_sentences[int(np.argmin(distances))]
                closest_sentence = closest[0]  # Use the original (unstemmed) sentence

                sent_doc = self.nlp(closest_sentence)
                if len(sent_doc) >= 5:
                    point = re.sub(r'\s+', ' ', closest_sentence.strip('., '))
                    if len(point.split()) >= 5:
                        # Store with the original index of the chosen sentence
                        key_points.append((point, closest[3]))

        # Sort key points based on their original position in the text
        key_points.sort(key=lambda x: x[1])
        return [point for point, _ in key_points]

    def summarize(self, text: str, num_sentences: int = 5) -> Dict[str, Union[str, List[str]]]:
        """Generate a comprehensive summary of the text."""
        cleaned_text, _ = self.clean_text(text)
        original_sentences = sent_tokenize(cleaned_text)
        # Stem each sentence individually so the stemmed list stays aligned with the originals
        stemmed_sentences = [self.stem_text(sentence) for sentence in original_sentences]
        num_sentences = min(num_sentences, len(original_sentences)) if original_sentences else 0

        sentence_scores = self.score_sentences(original_sentences, stemmed_sentences)
        summary_sentences = heapq.nlargest(num_sentences, sentence_scores.items(), key=lambda x: x[1])
        # Restore the selected sentences to their original document order
        summary_sentences.sort(key=lambda x: original_sentences.index(x[0]))

        summary = ' '.join([sentence for sentence, _ in summary_sentences])
        key_points = self.extract_key_points(
            original_sentences, stemmed_sentences, num_clusters=min(5, len(original_sentences)))

        return {
            'summary': summary,
            'key_points': key_points,
        }


def main(text):
    # Create summarizer instance
    summarizer = TextSummarization()

    # Generate summary
    summary = summarizer.summarize(text)

    print("Summary:")
    print(summary['summary'])

    print("------\n")

    print("Key Points:")
    for i, point in enumerate(summary["key_points"]):
        print(f"{i + 1}. {point}")

    print("------")


if __name__ == "__main__":
    main("""NLP is a subfield of computer science and artificial intelligence concerned with interactions between computers and human (natural) languages. It is used to apply machine learning algorithms to text and speech.

For example, we can use NLP to create systems like speech recognition, document summarization, machine translation, spam detection, named entity recognition, question answering, autocomplete, predictive typing and so on.

Nowadays, most of us have smartphones that have speech recognition. These smartphones use NLP to understand what is said. Also, many people use laptops whose operating systems have built-in speech recognition.""")