diff --git a/hw1/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/hw1/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..2fd6442 --- /dev/null +++ b/hw1/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hw1/Untitled.ipynb b/hw1/Untitled.ipynb new file mode 100644 index 0000000..b07b4df --- /dev/null +++ b/hw1/Untitled.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.672\n", + "2.619\n", + "2.791\n", + "2.746\n", + "0.119\n", + "0.127\n", + "0.030\n", + "0.030\n" + ] + } + ], + "source": [ + "\"\"\"Calculate information-theoretic measures of distributional\n", + "similarity based on word frequencies in two texts\n", + "\"\"\"\n", + "\n", + "import collections\n", + "import math\n", + "\n", + "\n", + "def read_words(infile):\n", + " with open(infile) as input_text:\n", + " return [x.strip() for x in input_text.read().split()]\n", + "\n", + "\n", + "def get_counts(word_list):\n", + " return collections.Counter(word_list)\n", + "\n", + "\n", + "def create_prob_dist(count_dict):\n", + " total_ct = sum(count_dict.values())\n", + " p = {x: ct / total_ct for x, ct in count_dict.items()}\n", + " return p\n", + "\n", + "\n", + "def count_smoothing(freq_dist, vocabulary, alpha=1):\n", + " \"\"\"Implement simple count-based probability smoothing.\n", + " Given a target vocabulary and a set of observed count frequencies,\n", + " calculate a new set of counts so that Count(x) > 0 for all words\n", + " in the target vocabulary. This is achieved by adding `alpha`\n", + " to each observed count\n", + " \"\"\"\n", + " return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}\n", + "\n", + "\n", + "def entropy(p):\n", + " \"\"\"Calculate entropy H(p) for a probability distribution represented\n", + " as a mapping (dictionary) from word tokens to probabilities\n", + " \"\"\"\n", + " h = 0\n", + "\n", + " # TODO -- Calculate entropy value in nats for probability distribution `p`\n", + " for x in p:\n", + " h -= p[x] * math.log(p[x])\n", + "\n", + " return h\n", + "\n", + "\n", + "def cross_entropy(p1, p2):\n", + " \"\"\"Calculate cross-entropy H(p1, p2) for two probability distributions\n", + " represented as a mapping (dictionary) from word tokens to\n", + " probabilities\n", + " \"\"\"\n", + " xh = 0\n", + "\n", + " # TODO -- Calculate cross-entropy value H(p1, p2) in nats\n", + " for x in p1:\n", + " xh -= p1[x] * math.log(p2[x])\n", + "\n", + " return xh\n", + "\n", + "\n", + "def kl_divergence(p1, p2):\n", + " \"\"\"Calculate Kullback-Leibler divergence D_{KL}(p1||p2) for two\n", + " probability distributions represented as a mapping (dictionary)\n", + " from word tokens to probabilities\n", + " \"\"\"\n", + " kl = 0\n", + "\n", + " # TODO -- Calculate KL divergence D_{KL}(p1||p2) in nats\n", + " kl = cross_entropy(p1, p2) - entropy(p1)\n", + "\n", + " return kl\n", + "\n", + "\n", + "def js_divergence(p1, p2):\n", + " \"\"\"Calculate Jensen-Shannon divergence D_{JS}(p1||p2) for two\n", + " probability distributions represented as a mapping (dictionary)\n", + " from word tokens to probabilities\n", + " \"\"\"\n", + " js = 0\n", + "\n", + " # TODO -- Calculate JS divergence D_{JS}(p1||p2) in nats\n", + " # M = (p1+p2)/2, built over the union of both vocabularies so that\n", + " m = {k: ((p1.get(k, 0) + p2.get(k, 0))/2.0) for k in set(p1) | set(p2)}\n", + " js = (kl_divergence(p1, m) + kl_divergence(p2, m))/2\n", + "
\n", + " return js\n", + "\n", + "if __name__ == \"__main__\":\n", + " \"\"\"Do not edit this code\n", + " \"\"\"\n", + " words_a = read_words(\"test_a.txt\")\n", + " words_b = read_words(\"test_b.txt\")\n", + "\n", + " ct_a = get_counts(words_a)\n", + " ct_b = get_counts(words_b)\n", + "\n", + " vocab = set(ct_a.keys()) | set(ct_b.keys())\n", + " ct_a = count_smoothing(ct_a, vocab)\n", + " ct_b = count_smoothing(ct_b, vocab)\n", + "\n", + " p_a = create_prob_dist(ct_a)\n", + " p_b = create_prob_dist(ct_b)\n", + "\n", + " h_a = entropy(p_a)\n", + " h_b = entropy(p_b)\n", + " xh_ab = cross_entropy(p_a, p_b)\n", + " xh_ba = cross_entropy(p_b, p_a)\n", + " kl_ab = kl_divergence(p_a, p_b)\n", + " kl_ba = kl_divergence(p_b, p_a)\n", + " js_ab = js_divergence(p_a, p_b)\n", + " js_ba = js_divergence(p_b, p_a)\n", + "\n", + " for metric in [h_a, h_b, xh_ab, xh_ba,\n", + " kl_ab, kl_ba, js_ab, js_ba]:\n", + " print(\"{:.3f}\".format(metric))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hw1/dist_measures.py b/hw1/dist_measures.py new file mode 100644 index 0000000..d2591f2 --- /dev/null +++ b/hw1/dist_measures.py @@ -0,0 +1,121 @@ +# Chandni Patel +# CS 585 - NLP +# Homework 1 + +"""Calculate information-theoretic measures of distributional +similarity based on word frequencies in two texts +""" + +import collections +import math + + +def read_words(infile): + with open(infile) as input_text: + return [x.strip() for x in input_text.read().split()] + + +def get_counts(word_list): + return collections.Counter(word_list) + + +def create_prob_dist(count_dict): + total_ct = sum(count_dict.values()) + p = {x: ct / total_ct for x, ct in count_dict.items()} + return p + + +def count_smoothing(freq_dist, vocabulary, alpha=1): + """Implement simple count-based probability smoothing. + Given a target vocabulary and a set of observed count frequencies, + calculate a new set of counts so that Count(x) > 0 for all words + in the target vocabulary. 
This is achieved by adding `alpha` + to each observed count + """ + return {w: freq_dist.get(w, 0) + alpha for w in vocabulary} + + +def entropy(p): + """Calculate entropy H(p) for a probability distribution represented + as a mapping (dictionary) from word tokens to probabilities + """ + h = 0 + + # TODO -- Calculate entropy value in nats for probability distribution `p` + for x in p: + h -= p[x] * math.log(p[x]) + + return h + + +def cross_entropy(p1, p2): + """Calculate cross-entropy H(p1, p2) for two probability distributions + represented as a mapping (dictionary) from word tokens to + probabilities + """ + xh = 0 + + # TODO -- Calculate cross-entropy value H(p1, p2) in nats + for x in p1: + xh -= p1[x] * math.log(p2[x]) + + return xh + + +def kl_divergence(p1, p2): + """Calculate Kullback-Leibler divergence D_{KL}(p1||p2) for two + probability distributions represented as a mapping (dictionary) + from word tokens to probabilities + """ + kl = 0 + + # TODO -- Calculate KL divergence D_{KL}(p1||p2) in nats + # D_{KL}(p1||p2) = H(p1,p2) - H(p1) + kl = cross_entropy(p1, p2) - entropy(p1) + + return kl + + +def js_divergence(p1, p2): + """Calculate Jensen-Shannon divergence D_{JS}(p1||p2) for two + probability distributions represented as a mapping (dictionary) + from word tokens to probabilities + """ + js = 0 + + # TODO -- Calculate JS divergence D_{JS}(p1||p2) in nats + # M = (p1+p2)/2, built over the union of both vocabularies + m = {k: ((p1.get(k, 0) + p2.get(k, 0))/2.0) for k in set(p1) | set(p2)} + # D_{JS}(p1||p2) = (D_{KL}(p1||M) + D_{KL}(p2||M))/2 + js = (kl_divergence(p1, m) + kl_divergence(p2, m))/2 + + return js + +if __name__ == "__main__": + """Do not edit this code + """ + words_a = read_words("test_a.txt") + words_b = read_words("test_b.txt") + + ct_a = get_counts(words_a) + ct_b = get_counts(words_b) + + vocab = set(ct_a.keys()) | set(ct_b.keys()) + ct_a = count_smoothing(ct_a, vocab) + ct_b = count_smoothing(ct_b, vocab) + + p_a = create_prob_dist(ct_a) + p_b = create_prob_dist(ct_b) + + h_a = entropy(p_a) + h_b = entropy(p_b) + xh_ab = cross_entropy(p_a, p_b) + xh_ba = cross_entropy(p_b, p_a) + kl_ab = kl_divergence(p_a, p_b) + kl_ba = kl_divergence(p_b, p_a) + js_ab = js_divergence(p_a, p_b) + js_ba = js_divergence(p_b, p_a) + + for metric in [h_a, h_b, xh_ab, xh_ba, + kl_ab, kl_ba, js_ab, js_ba]: + print("{:.3f}".format(metric)) diff --git a/hw1/images/img1.jpg b/hw1/images/img1.jpg new file mode 100644 index 0000000..64f7c2c Binary files /dev/null and b/hw1/images/img1.jpg differ diff --git a/hw1/images/img2.jpg b/hw1/images/img2.jpg new file mode 100644 index 0000000..23e2c5e Binary files /dev/null and b/hw1/images/img2.jpg differ diff --git a/hw1/images/img3.jpg b/hw1/images/img3.jpg new file mode 100644 index 0000000..2227f64 Binary files /dev/null and b/hw1/images/img3.jpg differ diff --git a/hw1/images/img4.jpg b/hw1/images/img4.jpg new file mode 100644 index 0000000..f98488a Binary files /dev/null and b/hw1/images/img4.jpg differ diff --git a/hw1/test.out.txt b/hw1/test.out.txt new file mode 100644 index 0000000..f5b01c7 --- /dev/null +++ b/hw1/test.out.txt @@ -0,0 +1,8 @@ +2.672 +2.619 +2.791 +2.746 +0.119 +0.127 +0.030 +0.030 diff --git a/hw1/test_a.txt b/hw1/test_a.txt new file mode 100644 index 0000000..d0d7baa --- /dev/null +++ b/hw1/test_a.txt @@ -0,0 +1 @@ +the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped diff --git a/hw1/test_b.txt b/hw1/test_b.txt new file mode 100644 index 0000000..613be66 --- /dev/null +++ b/hw1/test_b.txt @@ -0,0
+1 @@ +the dog and the cat jumped over the fox the red and brown fox jumped over the mouse diff --git a/hw2/hw2a-handout/gen_vocab.sh b/hw2/hw2a-handout/gen_vocab.sh new file mode 100644 index 0000000..650a2e8 --- /dev/null +++ b/hw2/hw2a-handout/gen_vocab.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Replace this line with a sequence of shell commands connected with Unix pipes ("|") + +# replace spaces with newlines to get one word per line +# then sort all words and keep only the unique ones +# remove empty lines + +tr ' ' '\n' | sort | uniq | sed -r '/^\s*$/d' diff --git a/hw2/hw2a-handout/test.in.txt b/hw2/hw2a-handout/test.in.txt new file mode 100644 index 0000000..a9f363d --- /dev/null +++ b/hw2/hw2a-handout/test.in.txt @@ -0,0 +1,3 @@ +the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped +the dog and the cat jumped over the fox the red and brown fox jumped over the mouse + quick brown foxes jumped and lazy dogs slept diff --git a/hw2/hw2a-handout/test.out.txt b/hw2/hw2a-handout/test.out.txt new file mode 100644 index 0000000..7acd1b0 --- /dev/null +++ b/hw2/hw2a-handout/test.out.txt @@ -0,0 +1,19 @@ +and +brown +cat +dog +dogs +fox +foxes +indolent +jumped +lazy +mouse +near +never +over +quick +red +slept +spry +the diff --git a/hw2/hw2b-handout/compare_cols.sh b/hw2/hw2b-handout/compare_cols.sh new file mode 100644 index 0000000..f23160e --- /dev/null +++ b/hw2/hw2b-handout/compare_cols.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Replace this line with one or more shell commands +# You may write to intermediate text files on disk if necessary + +# pull out columns 3 and 5 and paste them side by side, space-separated +cut -d ',' -f 3,5 > col35.txt +cut -d ',' -f 1 < col35.txt | awk '{print $1}' > col3.txt +cut -d ',' -f 2 < col35.txt > col5.txt +paste -d ' ' col3.txt col5.txt > col35.txt +# record a line whenever the first word of column 3 also appears in column 5 +: > lines.txt # reset lines.txt so stale matches from an earlier run are not counted +awk '{for(i=2;i<=NF;i++) + { + if($1 == $i) + { + print $1 > "lines.txt"; + break;} + } + }' col35.txt +wc -l lines.txt | tr ' ' '\n' | head -n 1 + + diff --git a/hw2/hw2b-handout/test.in.1.txt b/hw2/hw2b-handout/test.in.1.txt new file mode 100644 index 0000000..ed6ee1c --- /dev/null +++ b/hw2/hw2b-handout/test.in.1.txt @@ -0,0 +1,4 @@ +the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped +the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse +a brown,fox and,a red,dog jumped,over the,mouse and,the cat +the cat,slept and,the dog,jumped over,the mouse,while the fox,slept diff --git a/hw2/hw2b-handout/test.in.2.txt b/hw2/hw2b-handout/test.in.2.txt new file mode 100644 index 0000000..636f7b7 --- /dev/null +++ b/hw2/hw2b-handout/test.in.2.txt @@ -0,0 +1,2 @@ +the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped +the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse diff --git a/hw2/hw2b-handout/test.out.1.txt b/hw2/hw2b-handout/test.out.1.txt new file mode 100644 index 0000000..fb8c52e --- /dev/null +++ b/hw2/hw2b-handout/test.out.1.txt @@ -0,0 +1 @@ + 2 diff --git a/hw2/hw2b-handout/test.out.2.txt b/hw2/hw2b-handout/test.out.2.txt new file mode 100644 index 0000000..5dd69c0 --- /dev/null +++ b/hw2/hw2b-handout/test.out.2.txt @@ -0,0 +1 @@ + 1 diff --git a/hw2/hw2c-handout/count_articles.sh b/hw2/hw2c-handout/count_articles.sh new file mode 100644 index 0000000..6d463d0 --- /dev/null +++ b/hw2/hw2c-handout/count_articles.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Replace this line with one or
more shell commands +# You may write to intermediate text files on disk if necessary + +for file in test_*.txt +do +#The second field should be the number of lines in the file that include "the" as a word. +numthe=$(grep -c -E '\b[Tt]he\b' "$file") +#The third field should be the number of lines in the file that include "a" as a word. +numa=$(grep -c -E '\b[Aa]\b' "$file") +#The fourth field should be the number of lines in the file that include "an" as a word. +numan=$(grep -c -E '\b[Aa]n\b' "$file") +echo "$file,$numthe,$numa,$numan" +done + diff --git a/hw2/hw2c-handout/test.out.txt b/hw2/hw2c-handout/test.out.txt new file mode 100644 index 0000000..3387b89 --- /dev/null +++ b/hw2/hw2c-handout/test.out.txt @@ -0,0 +1,3 @@ +test_1.in.txt,2,0,1 +test_2.in.txt,0,2,1 +test_3.in.txt,2,0,0 diff --git a/hw2/hw2c-handout/test_1.in.txt b/hw2/hw2c-handout/test_1.in.txt new file mode 100644 index 0000000..0117acd --- /dev/null +++ b/hw2/hw2c-handout/test_1.in.txt @@ -0,0 +1,2 @@ +the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped +the dog and an owl jumped over the fox the red and brown fox jumped over the mouse diff --git a/hw2/hw2c-handout/test_2.in.txt b/hw2/hw2c-handout/test_2.in.txt new file mode 100644 index 0000000..3f65bd5 --- /dev/null +++ b/hw2/hw2c-handout/test_2.in.txt @@ -0,0 +1,2 @@ +a quick brown fox jumped over a lazy dog a spry red dog jumped near an indolent fox a lazy cat never jumped +a dog and a cat jumped over a fox a red and brown fox jumped over a mouse diff --git a/hw2/hw2c-handout/test_3.in.txt b/hw2/hw2c-handout/test_3.in.txt new file mode 100644 index 0000000..ee101ce --- /dev/null +++ b/hw2/hw2c-handout/test_3.in.txt @@ -0,0 +1,2 @@ +the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped +the dog and the cat jumped over the fox the red and brown fox jumped over the mouse diff --git a/hw2/images/img1.jpg b/hw2/images/img1.jpg new file mode 100644 index 0000000..0a5d0b5 Binary files /dev/null and b/hw2/images/img1.jpg differ diff --git a/hw2/images/img2.jpg b/hw2/images/img2.jpg new file mode 100644 index 0000000..348230c Binary files /dev/null and b/hw2/images/img2.jpg differ diff --git a/hw2/images/img3.jpg b/hw2/images/img3.jpg new file mode 100644 index 0000000..c93c783 Binary files /dev/null and b/hw2/images/img3.jpg differ diff --git a/hw4/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/hw4/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..dff6604 --- /dev/null +++ b/hw4/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] grammar_1 sentences_1\n", + "ipykernel_launcher.py: error: the following arguments are required: sentences_1\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2971: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", + " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\",
stacklevel=1)\n" + ] + } + ], + "source": [ + "# Chandni Patel\n", + "# CS 585 - NLP\n", + "# Homework 5\n", + "\n", + "\"\"\"Read grammar definition and sentence to parse,\n", + "then output a valid parse for the sentence, given the grammar.\n", + "\n", + "Implementation of Cocke-Younger-Kasami parsing\n", + "\"\"\"\n", + "\n", + "import argparse\n", + "from collections import namedtuple\n", + "import re\n", + "\n", + "# Data structures for rules\n", + "# Nonterminal rules have one symbol on left-hand side, two symbols on right-hand side\n", + "NonterminalRule = namedtuple(\"NonterminalRule\", [\"lhs\", \"rhs1\", \"rhs2\"])\n", + "# Terminal rules have one symbol on left-hand side, one symbol on right-hand side\n", + "TerminalRule = namedtuple(\"TerminalRule\", [\"lhs\", \"rhs\"])\n", + "\n", + "# Data structure for parsed phrase\n", + "ParsedPhrase = namedtuple(\"ParsedPhrase\", [\"label\", \"children\"])\n", + "\n", + "def parse_rules(infile):\n", + " \"\"\"Parse grammar file with phrase structure rules, and\n", + " return a tuple (nt, t), where nt is a list of nonterminal\n", + " rules and t is a list of terminal rules\n", + " \"\"\"\n", + " nt = []\n", + " t = []\n", + " ntmatcher = re.compile(r\"^\\s*(\\w+)\\s+->\\s+(\\w+)\\s+(\\w+)\\s*$\")\n", + " tmatcher = re.compile(r\"^\\s*(\\w+)\\s+->\\s+(\\w+)\\s*$\")\n", + " with open(infile) as input_text:\n", + " for line in input_text:\n", + " found = ntmatcher.search(line)\n", + " if found:\n", + " nt.append(NonterminalRule(*found.group(1, 2, 3)))\n", + " else:\n", + " found = tmatcher.search(line)\n", + " if found:\n", + " t.append(TerminalRule(*found.group(1, 2)))\n", + " return nt, t\n", + "\n", + "\n", + "def read_sentences(infile):\n", + " \"\"\"Read a file with one sentence per line, and return\n", + " a list of word lists (one for each sentence)\n", + " \"\"\"\n", + " with open(infile) as input_text:\n", + " return [line.strip().split() for line in input_text if line]\n", + "\n", + "\n", + "def parse_sentence(nt_rules, t_rules, words):\n", + " \"\"\"Parse a sentence with the CYK algorithm\n", + "\n", + " :param nt_rules: List of nonterminal rules in grammar\n", + " :param t_rules: List of terminal rules in grammar\n", + " :param words: sequence (list) of words in sentence to parse\n", + " :return: Recursively-nested NonterminalRule object representing parse tree\n", + " (or None if parsing fails)\n", + " \"\"\"\n", + " # NOTE -- you can change this data structure / function if you prefer to do\n", + " # this differently, but the function still needs to return\n", + " # - a parse represented as recursively nested NonterminalRule / TerminalRule objects\n", + " # - or None if the sentence cannot be parsed\n", + "\n", + " # chart[m][n][symb] will contain a ParsedPhrase object for a phrase\n", + " # - of length m+1\n", + " # - starting at word n\n", + " # - of phrase category symb\n", + " chart = [[{} for j in range(len(words))] for i in range(len(words))]\n", + "\n", + " # Initialize terminals in chart\n", + " for i, word in enumerate(words):\n", + " for tr in t_rules:\n", + " if word == tr.rhs:\n", + " chart[0][i][tr.lhs] = ParsedPhrase(label=tr.lhs, children=[word])\n", + "\n", + " # Work up the chart\n", + " # TODO\n", + " # Implementing Cocke-Younger-Kasami parsing algorithm\n", + " # for m := 1 to Nw-1 do:\n", + " for m in range(1, len(words)):\n", + " # for n := 0 to Nw-m-1 do:\n", + " for n in range(len(words)-m):\n", + " # chart[m, n] := {}\n", + " chart[m][n] = {}\n", + " # for k := n+1 to n+m do:\n", + " for k in range(n+1, 
n+m+1):\n", + " # for every_rule A → BC do:\n", + " for A in nt_rules:\n", + " # if B ∈ chart[k-n-1, n] and C ∈ chart[n+m-k, k] then:\n", + " if (A.rhs1 in chart[k-n-1][n]) and (A.rhs2 in chart[n+m-k][k]):\n", + " # chart[m, n] := chart[m, n] ∪ {A}\n", + " chart[m][n][A.lhs] = ParsedPhrase(label=A.lhs, \n", + " children=[chart[k-n-1][n].get(A.rhs1), chart[n+m-k][k].get(A.rhs2)])\n", + " # END TODO\n", + "\n", + " # All valid sentence parses should have the label \"S\"\n", + " return chart[len(words)-1][0].get(\"S\")\n", + "\n", + "\n", + "def parse_to_string(parse):\n", + " \"\"\"Return a string representation of a parsed phrase object\n", + " \"\"\"\n", + " if len(parse.children) > 1:\n", + " return f\"({parse.label} {parse_to_string(parse.children[0])} {parse_to_string(parse.children[1])})\"\n", + " return f\"({parse.label} {parse.children[0]})\"\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " \"\"\"Do not edit this code\n", + " \"\"\"\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"grammar_1\", default=\"grammar_1.txt\")\n", + " parser.add_argument(\"sentences_1\", default=\"sentences_1.txt\")\n", + " args = parser.parse_args()\n", + " \n", + " nt_rules, t_rules = parse_rules(args.grammar_file)\n", + " word_sequences = read_sentences(args.sentence_file)\n", + "\n", + " for s in word_sequences:\n", + " parse = parse_sentence(nt_rules, t_rules, s)\n", + " if parse:\n", + " print(parse_to_string(parse))\n", + " else:\n", + " print(\"Parsing failed\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hw4/Untitled.ipynb b/hw4/Untitled.ipynb new file mode 100644 index 0000000..9ba5244 --- /dev/null +++ b/hw4/Untitled.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] grammar_file sentence_file\n", + "ipykernel_launcher.py: error: the following arguments are required: sentence_file\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2971: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", + " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" + ] + } + ], + "source": [ + "# Chandni Patel\n", + "# CS 585 - NLP\n", + "# Homework 4\n", + "\n", + "\"\"\"Read grammar definition and sentence to parse,\n", + "then output a valid parse for the sentence, given the grammar.\n", + "\n", + "Implementation of Cocke-Younger-Kasami parsing\n", + "\"\"\"\n", + "\n", + "import argparse\n", + "from collections import namedtuple\n", + "import re\n", + "\n", + "# Data structures for rules\n", + "# Nonterminal rules 
have one symbol on left-hand side, two symbols on right-hand side\n", + "NonterminalRule = namedtuple(\"NonterminalRule\", [\"lhs\", \"rhs1\", \"rhs2\"])\n", + "# Terminal rules have one symbol on left-hand side, one symbol on right-hand side\n", + "TerminalRule = namedtuple(\"TerminalRule\", [\"lhs\", \"rhs\"])\n", + "\n", + "# Data structure for parsed phrase\n", + "ParsedPhrase = namedtuple(\"ParsedPhrase\", [\"label\", \"children\"])\n", + "\n", + "def parse_rules(infile):\n", + " \"\"\"Parse grammar file with phrase structure rules, and\n", + " return a tuple (nt, t), where nt is a list of nonterminal\n", + " rules and t is a list of terminal rules\n", + " \"\"\"\n", + " nt = []\n", + " t = []\n", + " ntmatcher = re.compile(r\"^\\s*(\\w+)\\s+->\\s+(\\w+)\\s+(\\w+)\\s*$\")\n", + " tmatcher = re.compile(r\"^\\s*(\\w+)\\s+->\\s+(\\w+)\\s*$\")\n", + " with open(infile) as input_text:\n", + " for line in input_text:\n", + " found = ntmatcher.search(line)\n", + " if found:\n", + " nt.append(NonterminalRule(*found.group(1, 2, 3)))\n", + " else:\n", + " found = tmatcher.search(line)\n", + " if found:\n", + " t.append(TerminalRule(*found.group(1, 2)))\n", + " return nt, t\n", + "\n", + "\n", + "def read_sentences(infile):\n", + " \"\"\"Read a file with one sentence per line, and return\n", + " a list of word lists (one for each sentence)\n", + " \"\"\"\n", + " with open(infile) as input_text:\n", + " return [line.strip().split() for line in input_text if line]\n", + "\n", + "\n", + "def parse_sentence(nt_rules, t_rules, words):\n", + " \"\"\"Parse a sentence with the CYK algorithm\n", + "\n", + " :param nt_rules: List of nonterminal rules in grammar\n", + " :param t_rules: List of terminal rules in grammar\n", + " :param words: sequence (list) of words in sentence to parse\n", + " :return: Recursively-nested NonterminalRule object representing parse tree\n", + " (or None if parsing fails)\n", + " \"\"\"\n", + " # NOTE -- you can change this data structure / function if you prefer to do\n", + " # this differently, but the function still needs to return\n", + " # - a parse represented as recursively nested NonterminalRule / TerminalRule objects\n", + " # - or None if the sentence cannot be parsed\n", + "\n", + " # chart[m][n][symb] will contain a ParsedPhrase object for a phrase\n", + " # - of length m+1\n", + " # - starting at word n\n", + " # - of phrase category symb\n", + " chart = [[{} for j in range(len(words))] for i in range(len(words))]\n", + "\n", + " # Initialize terminals in chart\n", + " for i, word in enumerate(words):\n", + " for tr in t_rules:\n", + " if word == tr.rhs:\n", + " chart[0][i][tr.lhs] = ParsedPhrase(label=tr.lhs, children=[word])\n", + "\n", + " # Work up the chart\n", + " # TODO\n", + " # Implementing Cocke-Younger-Kasami parsing algorithm\n", + " # for m := 1 to Nw-1 do:\n", + " for m in range(1, len(words)):\n", + " # for n := 0 to Nw-m-1 do:\n", + " for n in range(len(words)-m):\n", + " # chart[m, n] := {}\n", + " chart[m][n] = {}\n", + " # for k := n+1 to n+m do:\n", + " for k in range(n+1, n+m+1):\n", + " # for every_rule A → BC do:\n", + " for A in nt_rules:\n", + " # if B ∈ chart[k-n-1, n] and C ∈ chart[n+m-k, k] then:\n", + " if (A.rhs1 in chart[k-n-1][n]) and (A.rhs2 in chart[n+m-k][k]):\n", + " # chart[m, n] := chart[m, n] ∪ {A}\n", + " chart[m][n][A.lhs] = ParsedPhrase(label=A.lhs, \n", + " children=[chart[k-n-1][n].get(A.rhs1), chart[n+m-k][k].get(A.rhs2)])\n", + " # END TODO\n", + "\n", + " # All valid sentence parses should have the label \"S\"\n", + " 
return chart[len(words)-1][0].get(\"S\")\n", + "\n", + "\n", + "def parse_to_string(parse):\n", + " \"\"\"Return a string representation of a parsed phrase object\n", + " \"\"\"\n", + " if len(parse.children) > 1:\n", + " return f\"({parse.label} {parse_to_string(parse.children[0])} {parse_to_string(parse.children[1])})\"\n", + " return f\"({parse.label} {parse.children[0]})\"\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " \"\"\"Do not edit this code\n", + " \"\"\"\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"grammar_file\", default=\"grammar_1.txt\")\n", + " parser.add_argument(\"sentence_file\", default=\"sentences_1.txt\")\n", + " args = parser.parse_args()\n", + "\n", + " nt_rules, t_rules = parse_rules(args.grammar_file)\n", + " word_sequences = read_sentences(args.sentence_file)\n", + "\n", + " for s in word_sequences:\n", + " parse = parse_sentence(nt_rules, t_rules, s)\n", + " if parse:\n", + " print(parse_to_string(parse))\n", + " else:\n", + " print(\"Parsing failed\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hw4/cyk.py b/hw4/cyk.py new file mode 100644 index 0000000..dda44e6 --- /dev/null +++ b/hw4/cyk.py @@ -0,0 +1,127 @@ +# Chandni Patel +# CS 585 - NLP +# Homework 4 + +"""Read grammar definition and sentence to parse, +then output a valid parse for the sentence, given the grammar. 
+ +Implementation of Cocke-Younger-Kasami parsing +""" + +import argparse +from collections import namedtuple +import re + +# Data structures for rules +# Nonterminal rules have one symbol on left-hand side, two symbols on right-hand side +NonterminalRule = namedtuple("NonterminalRule", ["lhs", "rhs1", "rhs2"]) +# Terminal rules have one symbol on left-hand side, one symbol on right-hand side +TerminalRule = namedtuple("TerminalRule", ["lhs", "rhs"]) + +# Data structure for parsed phrase +ParsedPhrase = namedtuple("ParsedPhrase", ["label", "children"]) + +def parse_rules(infile): + """Parse grammar file with phrase structure rules, and + return a tuple (nt, t), where nt is a list of nonterminal + rules and t is a list of terminal rules + """ + nt = [] + t = [] + ntmatcher = re.compile(r"^\s*(\w+)\s+->\s+(\w+)\s+(\w+)\s*$") + tmatcher = re.compile(r"^\s*(\w+)\s+->\s+(\w+)\s*$") + with open(infile) as input_text: + for line in input_text: + found = ntmatcher.search(line) + if found: + nt.append(NonterminalRule(*found.group(1, 2, 3))) + else: + found = tmatcher.search(line) + if found: + t.append(TerminalRule(*found.group(1, 2))) + return nt, t + + +def read_sentences(infile): + """Read a file with one sentence per line, and return + a list of word lists (one for each sentence) + """ + with open(infile) as input_text: + return [line.strip().split() for line in input_text if line] + + +def parse_sentence(nt_rules, t_rules, words): + """Parse a sentence with the CYK algorithm + + :param nt_rules: List of nonterminal rules in grammar + :param t_rules: List of terminal rules in grammar + :param words: sequence (list) of words in sentence to parse + :return: Recursively-nested NonterminalRule object representing parse tree + (or None if parsing fails) + """ + # NOTE -- you can change this data structure / function if you prefer to do + # this differently, but the function still needs to return + # - a parse represented as recursively nested NonterminalRule / TerminalRule objects + # - or None if the sentence cannot be parsed + + # chart[m][n][symb] will contain a ParsedPhrase object for a phrase + # - of length m+1 + # - starting at word n + # - of phrase category symb + chart = [[{} for j in range(len(words))] for i in range(len(words))] + + # Initialize terminals in chart + for i, word in enumerate(words): + for tr in t_rules: + if word == tr.rhs: + chart[0][i][tr.lhs] = ParsedPhrase(label=tr.lhs, children=[word]) + + # Work up the chart + # TODO + # Implementing Cocke-Younger-Kasami parsing algorithm + # for m := 1 to Nw-1 do: + for m in range(1, len(words)): + # for n := 0 to Nw-m-1 do: + for n in range(len(words)-m): + # chart[m, n] := {} + chart[m][n] = {} + # for k := n+1 to n+m do: + for k in range(n+1, n+m+1): + # for every_rule A → BC do: + for A in nt_rules: + # if B ∈ chart[k-n-1, n] and C ∈ chart[n+m-k, k] then: + if (A.rhs1 in chart[k-n-1][n]) and (A.rhs2 in chart[n+m-k][k]): + # chart[m, n] := chart[m, n] ∪ {A} + chart[m][n][A.lhs] = ParsedPhrase(label=A.lhs, + children=[chart[k-n-1][n].get(A.rhs1), chart[n+m-k][k].get(A.rhs2)]) + # END TODO + + # All valid sentence parses should have the label "S" + return chart[len(words)-1][0].get("S") + + +def parse_to_string(parse): + """Return a string representation of a parsed phrase object + """ + if len(parse.children) > 1: + return f"({parse.label} {parse_to_string(parse.children[0])} {parse_to_string(parse.children[1])})" + return f"({parse.label} {parse.children[0]})" + + +if __name__ == "__main__": + """Do not edit this code + """ + 
parser = argparse.ArgumentParser() + parser.add_argument("--grammar_file", default="grammar_1.txt") + parser.add_argument("--sentence_file", default="sentences_1.txt") + args = parser.parse_args() + + nt_rules, t_rules = parse_rules(args.grammar_file) + word_sequences = read_sentences(args.sentence_file) + + for s in word_sequences: + parse = parse_sentence(nt_rules, t_rules, s) + if parse: + print(parse_to_string(parse)) + else: + print("Parsing failed") diff --git a/hw4/grammar_1.txt b/hw4/grammar_1.txt new file mode 100644 index 0000000..238774d --- /dev/null +++ b/hw4/grammar_1.txt @@ -0,0 +1,15 @@ +# non-terminal rules +S -> NP VP +VP -> VP PP +VP -> V NP +PP -> P NP +NP -> DT N + +# terminal rules +NP -> she +VP -> sees +V -> sees +P -> with +N -> man +N -> telescope +DT -> a diff --git a/hw4/grammar_2.txt b/hw4/grammar_2.txt new file mode 100644 index 0000000..bf6d533 --- /dev/null +++ b/hw4/grammar_2.txt @@ -0,0 +1,20 @@ +# non-terminal rules +S -> NP VP +VP -> VP CVP +CVP -> CC VP +VP -> V NP +NP -> NP CNP +CNP -> CC NP +NP -> DT N + +# terminal rules +NP -> they +VP -> buy +V -> buy +VP -> duck +NP -> chicken +NP -> duck +N -> chicken +N -> duck +DT -> a +CC -> and diff --git a/hw4/images/img1.jpg b/hw4/images/img1.jpg new file mode 100644 index 0000000..6e3f29a Binary files /dev/null and b/hw4/images/img1.jpg differ diff --git a/hw4/sentences_1.txt b/hw4/sentences_1.txt new file mode 100644 index 0000000..058c11a --- /dev/null +++ b/hw4/sentences_1.txt @@ -0,0 +1,4 @@ +she sees a man with a telescope +a man sees a telescope +man sees telescope +she sees she diff --git a/hw4/sentences_2.txt b/hw4/sentences_2.txt new file mode 100644 index 0000000..8030fb0 --- /dev/null +++ b/hw4/sentences_2.txt @@ -0,0 +1,4 @@ +they buy a chicken and duck +they buy a duck and a chicken +a duck and a chicken +chicken buy