# Project-1
- The project is to generate N-grams from 5 blockchain-related papers.
- From the combined document we need to generate 5 sentences of 10 words each, in n-gram format.
- Install the packages from the requirements.txt file (a minimal example is sketched right after this list).
- Open JupyterLab and create a new notebook.
- Import the nltk package and download a few additional nltk resources.
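The requirements.txt file itself is not reproduced here; judging only from the imports used below, a minimal version might contain no more than:
nltk
jupyterlab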
### Code for Project 1
#### Create a virtual environment
python3 -m venv .env
This creates a (hidden) directory called .env under the current directory. Activate it and then upgrade pip:
source .env/bin/activate
python3 -m pip install --upgrade pip
Then install the packages from requirements.txt:
pip install -r requirements.txt
#### Code
import nltk
import random as rand
# Execute the lines below once to download the required nltk resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
num_of_sentences = 5
num_of_words = 10
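# Note: the original listing calls remove_punctuation() below without defining it.
# The helper here is an assumed implementation that simply drops punctuation-only tokens.
import string

def remove_punctuation(token_list):
    # Keep only tokens that are not pure punctuation characters
    return [tok for tok in token_list if tok not in string.punctuation]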
# Read the file and preprocess the text
filename = "/Users/chathurya/Downloads/papers.txt"
with open(filename, 'r', encoding="ISO-8859-1") as f:
    content = f.readlines()
content = [line.strip() for line in content if line.strip() != '']
tokens = [word.lower() for line in content for word in nltk.word_tokenize(line)]
refined_tokens = remove_punctuation(tokens)
words = [word for word in refined_tokens if word.isalpha()]
print(words[:50])

In the code above we import the nltk and random packages (plus the extra nltk resources), read the file, and preprocess it: empty lines are dropped, the text is lowercased and tokenized with nltk.word_tokenize, punctuation tokens are removed, only alphabetic words are kept, and the first 50 words are printed as a sanity check.
# Generate 5 sentences with 10 words each using unigrams
unigrams = []
freq_unigrams = nltk.FreqDist(words)
for token_ in freq_unigrams.keys():
    unigrams.append(token_)
length = len(words)
for i in range(num_of_sentences):
    sentence = []
    tokens_sentence = []
    for j in range(num_of_words):
        word_ = []
        # Pick a random position in the text and take the word j places after it
        index = rand.randint(20, length) - 10
        sentence.append(words[index + j])
        word_.append(words[index + j])
        tokens_sentence.append(tuple(word_))
    print(" ".join(sentence))
    print(tokens_sentence)
We create an empty unigrams list, build an nltk.FreqDist over the words, and append each distinct token from its keys to that list. Then, for each of the 5 sentences, we append 10 words picked from random positions in the text and print the sentence together with its word tuples.
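The loop above samples words by position rather than by frequency. If we wanted the output to reflect the unigram distribution more directly, a weighted draw from freq_unigrams could be used instead; this is only a sketch of that alternative, reusing the words, freq_unigrams, num_of_sentences and num_of_words variables defined above.
# Alternative sketch (not part of the original listing): sample words in
# proportion to their unigram frequencies instead of by position in the text.
unigram_tokens = list(freq_unigrams.keys())
unigram_counts = [freq_unigrams[tok] for tok in unigram_tokens]
for _ in range(num_of_sentences):
    sampled = rand.choices(unigram_tokens, weights=unigram_counts, k=num_of_words)
    print(" ".join(sampled))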
# Generate 5 sentences with 10 words each using bigrams
print("BIGRAM:\n")
bigrams = list(nltk.bigrams(words))
freq_bigrams = nltk.FreqDist(bigrams)
bigrams = []
for token_ in freq_bigrams.keys():
    bigrams.append(token_)
for i in range(num_of_sentences):
    sentence = []
    tokens_sentence = []
    for j in range(num_of_words):
        if j == 0:
            # Choose the first word randomly
            word = rand.choice(words)
        else:
            # Choose the next word based on the previous word
            prev_word = sentence[-1]
            # choices = [n[1] for n in bigrams if n[0] == prev_word]
            choices = []
            token_choices = []
            for n in bigrams:
                if n[0] == prev_word:
                    choices.append(n[1])
                    token_choices.append(n)
            if not choices:
                # If there are no continuations, pick a random word
                word = rand.choice(words)
            else:
                # Pick a random word from the continuations
                word = rand.choice(choices)
                tokens_sentence.append(token_choices[choices.index(word)])
        sentence.append(word)
    print(" ".join(sentence))
    print(tokens_sentence)
    print()
Here we build the list of bigrams from the text. The first word of each sentence is picked randomly; every following word is picked at random from the words that follow the previous word in some bigram, with a random word as the fallback when no bigram matches. Each chosen word is appended to the sentence.
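If we instead wanted each next word to be the single most frequent continuation of the previous word, nltk.ConditionalFreqDist gives that lookup directly. A minimal sketch under that assumption, reusing words and rand from above:
# Sketch (not part of the original code): pick the most likely next word
# for a given previous word using a conditional frequency distribution.
cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))
prev_word = rand.choice(words)
if prev_word in cfd:
    next_word = cfd[prev_word].max()   # most frequent word following prev_word
else:
    next_word = rand.choice(words)     # fall back to a random word
print(prev_word, next_word)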
# Generate 5 sentences with 10 words each using trigrams
print("TRIGRAM:\n")
trigrams = list(nltk.trigrams(words))
freq_trigrams = nltk.FreqDist(trigrams)
trigrams = []
for token_ in freq_trigrams.keys():
    trigrams.append(token_)
for i in range(num_of_sentences):
    sentence = []
    tokens_sentence = []
    for j in range(num_of_words):
        if j == 0:
            # Choose the first word randomly
            word = rand.choice(words)
            sentence.append(word)
        elif j == 1:
            # Choose the second word from the first bigram starting with the first word
            prev_words = sentence[-1]
            for n in bigrams:
                if n[0] == prev_words:
                    sentence.append(n[1])
                    break
        else:
            # Choose the next word based on the previous two words
            prev_words = tuple(sentence[-2:])
            # choices = [n[2] for n in trigrams if n[:2] == prev_words]
            choices = False
            for n in trigrams:
                if n[:2] == prev_words:
                    choices = True
                    sentence.append(n[2])
                    tokens_sentence.append(n)
                    break
            if not choices:
                # If no trigram matches, pick a random word instead
                word = rand.choice(words)
                sentence.append(word)
                tokens_sentence.append(word)
    print(" ".join(sentence))
    print(tokens_sentence)
    print()
We generate trigrams from the text. The first word of each sentence is picked randomly, the second word is taken from the first bigram that starts with it, and from the third word onward we append the continuation of the first trigram whose first two elements match the previous two words, falling back to a random word when no trigram matches.
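The same conditional-frequency idea extends to trigrams by conditioning on the previous two words. A small sketch under that assumption, again reusing the words list:
# Sketch (not part of the original code): look up continuations of a two-word
# history with a conditional frequency distribution built from the trigrams.
cfd3 = nltk.ConditionalFreqDist(((w1, w2), w3) for w1, w2, w3 in nltk.trigrams(words))
history = tuple(words[:2])                 # any two consecutive words as the history
if history in cfd3:
    print(history, "->", cfd3[history].max())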
# Generate 5 sentences with 10 words each using 4-grams
print("4-GRAM:\n")
fourgrams = list(nltk.ngrams(words, 4))
freq_fourgrams = nltk.FreqDist(fourgrams)
fourgrams = []
for token_ in freq_fourgrams.keys():
    fourgrams.append(token_)
for i in range(num_of_sentences):
    sentence = []
    tokens_sentence = []
    for j in range(num_of_words):
        if j == 0:
            # Choose the first word randomly
            word = rand.choice(words)
            sentence.append(word)
        elif j == 1:
            # Choose the second word from the first bigram starting with the first word
            prev_words = sentence[-1]
            for n in bigrams:
                if n[0] == prev_words:
                    sentence.append(n[1])
                    break
        elif j == 2:
            # Choose the third word from the first trigram starting with the first two words
            prev_words = tuple(sentence[-2:])
            for n in trigrams:
                if n[:2] == prev_words:
                    sentence.append(n[2])
                    break
        else:
            # Choose the next word based on the previous three words
            prev_words = tuple(sentence[-3:])
            choices = False
            for n in fourgrams:
                if n[:3] == prev_words:
                    choices = True
                    sentence.append(n[3])
                    tokens_sentence.append(n)
                    break
            if not choices:
                # If no 4-gram matches, pick a random word instead
                word = rand.choice(words)
                sentence.append(word)
                tokens_sentence.append(word)
    print(" ".join(sentence))
    print(tokens_sentence)
    print()

Similarly, we can generate the 4-grams by extending the trigram method: the first word is random, the second and third words come from the bigram and trigram lists, and each later word comes from the first 4-gram whose first three words match the previous three.
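The trigram and 4-gram loops above repeat the same pattern with a longer history, so the whole procedure could be folded into one helper that works for any n of 2 or more. This is only a sketch of that generalization (the function name and structure are my own, not part of the original code), reusing words, rand, num_of_sentences and num_of_words:
def generate_sentence(words, n, num_words, rng=rand):
    # Build n-gram lists for every order from 2 up to n (order 1 is just a random word)
    grams = {k: list(nltk.ngrams(words, k)) for k in range(2, n + 1)}
    sentence = [rng.choice(words)]
    while len(sentence) < num_words:
        order = min(n, len(sentence) + 1)      # use as much history as is available
        history = tuple(sentence[-(order - 1):])
        next_word = None
        for gram in grams[order]:
            if gram[:-1] == history:           # first n-gram whose prefix matches
                next_word = gram[-1]
                break
        if next_word is None:
            next_word = rng.choice(words)      # fall back to a random word
        sentence.append(next_word)
    return " ".join(sentence)

# Example: five 10-word sentences from the 4-gram model
for _ in range(num_of_sentences):
    print(generate_sentence(words, 4, num_of_words))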