-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
34 changed files
with
948 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2.672\n", | ||
"2.619\n", | ||
"2.791\n", | ||
"2.746\n", | ||
"0.119\n", | ||
"0.127\n", | ||
"0.030\n", | ||
"0.030\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"\"\"\"Calculate information-theoretic measures of distributional\n", | ||
"similarity based on word frequencies in two texts\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"import collections\n", | ||
"import math\n", | ||
"\n", | ||
"\n", | ||
"def read_words(infile):\n", | ||
" with open(infile) as input_text:\n", | ||
" return [x.strip() for x in input_text.read().split()]\n", | ||
"\n", | ||
"\n", | ||
"def get_counts(word_list):\n", | ||
" return collections.Counter(word_list)\n", | ||
"\n", | ||
"\n", | ||
"def create_prob_dist(count_dict):\n", | ||
" total_ct = sum(count_dict.values())\n", | ||
" p = {x: ct / total_ct for x, ct in count_dict.items()}\n", | ||
" return p\n", | ||
"\n", | ||
"\n", | ||
"def count_smoothing(freq_dist, vocabulary, alpha=1):\n", | ||
" \"\"\"Implement simple count-based probability smoothing.\n", | ||
" Given a target vocabulary and a set of observed count frequencies,\n", | ||
" calculate a new set of counts so that Count(x) > 0 for all words\n", | ||
" in the target vocabulary. This is achieved by adding `alpha`\n", | ||
" to each observed count\n", | ||
" \"\"\"\n", | ||
" return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}\n", | ||
"\n", | ||
"\n", | ||
"def entropy(p):\n", | ||
" \"\"\"Calculate entropy H(p) for a probability distribution represented\n", | ||
" as a mapping (dictionary) from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" h = 0\n", | ||
"\n", | ||
" # TODO -- Calculate entropy value in nats for probability distribution `p`\n", | ||
" for x in p:\n", | ||
" h -= p[x] * math.log(p[x])\n", | ||
"\n", | ||
" return h\n", | ||
"\n", | ||
"\n", | ||
"def cross_entropy(p1, p2):\n", | ||
" \"\"\"Calculate cross-entropy H(p1, p2) for two probability distributions\n", | ||
" represented as a mapping (dictionary) from word tokens to\n", | ||
" probabilities\n", | ||
" \"\"\"\n", | ||
" xh = 0\n", | ||
"\n", | ||
" # TODO -- Calculate cross-entropy value H(p1, p2) in nats\n", | ||
" for x in p1:\n", | ||
" xh -= p1[x] * math.log(p2[x])\n", | ||
"\n", | ||
" return xh\n", | ||
"\n", | ||
"\n", | ||
"def kl_divergence(p1, p2):\n", | ||
" \"\"\"Calculate Kullback-Leibler divergence D_{KL}(p1||p2) for two\n", | ||
" probability distributions represented as a mapping (dictionary)\n", | ||
" from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" kl = 0\n", | ||
"\n", | ||
" # TODO -- Calculate KL divergence D_{KL}(p1||p2) in nats\n", | ||
" kl = cross_entropy(p1, p2) - entropy(p1)\n", | ||
"\n", | ||
" return kl\n", | ||
"\n", | ||
"\n", | ||
"def js_divergence(p1, p2):\n", | ||
" \"\"\"Calculate Jensen-Shannon divergence D_{JS}(p1||p2) for two\n", | ||
" probability distributions represented as a mapping (dictionary)\n", | ||
" from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" js = 0\n", | ||
"\n", | ||
" # TODO -- Calculate JS divergence D_{JS}(p1||p2) in nats\n", | ||
" m = {k: ((p1.get(k, 0) + p2.get(k, 0))/2.0) for k in p1.keys()} \n", | ||
" js = (kl_divergence(p1, m) + kl_divergence(p2, m))/2\n", | ||
" \n", | ||
" return js\n", | ||
"\n", | ||
"if __name__ == \"__main__\":\n", | ||
" \"\"\"Do not edit this code\n", | ||
" \"\"\"\n", | ||
" words_a = read_words(\"test_a.txt\")\n", | ||
" words_b = read_words(\"test_b.txt\")\n", | ||
"\n", | ||
" ct_a = get_counts(words_a)\n", | ||
" ct_b = get_counts(words_b)\n", | ||
"\n", | ||
" vocab = set(ct_a.keys()) | set(ct_b.keys())\n", | ||
" ct_a = count_smoothing(ct_a, vocab)\n", | ||
" ct_b = count_smoothing(ct_b, vocab)\n", | ||
"\n", | ||
" p_a = create_prob_dist(ct_a)\n", | ||
" p_b = create_prob_dist(ct_b)\n", | ||
"\n", | ||
" h_a = entropy(p_a)\n", | ||
" h_b = entropy(p_b)\n", | ||
" xh_ab = cross_entropy(p_a, p_b)\n", | ||
" xh_ba = cross_entropy(p_b, p_a)\n", | ||
" kl_ab = kl_divergence(p_a, p_b)\n", | ||
" kl_ba = kl_divergence(p_b, p_a)\n", | ||
" js_ab = js_divergence(p_a, p_b)\n", | ||
" js_ba = js_divergence(p_b, p_a)\n", | ||
"\n", | ||
" for metric in [h_a, h_b, xh_ab, xh_ba,\n", | ||
" kl_ab, kl_ba, js_ab, js_ba]:\n", | ||
" print(\"{:.3f}\".format(metric))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# Chandni Patel | ||
# CS 585 - NLP | ||
# Homework 1 | ||
|
||
"""Calculate information-theoretic measures of distributional | ||
similarity based on word frequencies in two texts | ||
""" | ||
|
||
import collections | ||
import math | ||
|
||
|
||
def read_words(infile):
    """Read a text file and return its whitespace-separated tokens as a list."""
    with open(infile) as input_text:
        contents = input_text.read()
    return [token.strip() for token in contents.split()]
|
||
|
||
def get_counts(word_list):
    """Tally occurrences of each word, returning a Counter keyed by word."""
    counts = collections.Counter()
    counts.update(word_list)
    return counts
|
||
|
||
def create_prob_dist(count_dict):
    """Normalize a word->count mapping into a word->probability mapping."""
    total = sum(count_dict.values())
    prob_dist = {}
    for word, count in count_dict.items():
        prob_dist[word] = count / total
    return prob_dist
|
||
|
||
def count_smoothing(freq_dist, vocabulary, alpha=1):
    """Apply add-alpha (Laplace) count smoothing over a target vocabulary.

    Each word in `vocabulary` gets its observed count from `freq_dist`
    (0 when unseen) plus `alpha`, so every word ends up with Count(x) > 0.
    """
    smoothed = {}
    for word in vocabulary:
        observed = freq_dist[word] if word in freq_dist else 0
        smoothed[word] = observed + alpha
    return smoothed
|
||
|
||
def entropy(p):
    """Calculate entropy H(p) in nats for a probability distribution
    represented as a mapping (dictionary) from word tokens to probabilities.

    Zero-probability entries contribute nothing (0 * log 0 == 0 by
    convention) instead of raising a math domain error, so distributions
    that explicitly list unseen tokens with probability 0 are accepted.
    """
    h = 0
    for x, px in p.items():
        # Skip px == 0: its limit contribution is 0, and math.log(0) raises.
        if px > 0:
            h -= px * math.log(px)
    return h
|
||
|
||
def cross_entropy(p1, p2):
    """Calculate cross-entropy H(p1, p2) in nats for two probability
    distributions represented as mappings (dictionaries) from word tokens
    to probabilities.

    Terms with p1[x] == 0 contribute nothing by the 0 * log 0 convention
    (the original raised ValueError if such a token had p2[x] == 0 too).
    A token with p1[x] > 0 must still be present in p2 (KeyError otherwise),
    matching the original contract.
    """
    xh = 0
    for x, px in p1.items():
        # Skip px == 0: zero-weight terms vanish in the limit.
        if px > 0:
            xh -= px * math.log(p2[x])
    return xh
|
||
|
||
def kl_divergence(p1, p2):
    """Calculate Kullback-Leibler divergence D_{KL}(p1||p2) in nats for two
    probability distributions represented as mappings (dictionaries) from
    word tokens to probabilities, using the identity
    D_{KL}(p1||p2) = H(p1, p2) - H(p1).
    """
    return cross_entropy(p1, p2) - entropy(p1)
|
||
|
||
def js_divergence(p1, p2):
    """Calculate Jensen-Shannon divergence D_{JS}(p1||p2) in nats for two
    probability distributions represented as mappings (dictionaries) from
    word tokens to probabilities.

    Bug fix: the mixture M = (p1 + p2) / 2 is now built over the UNION of
    both vocabularies. The original built it from p1's keys only, which
    dropped tokens unique to p2 (skewing M and making D_{KL}(p2||M) raise
    KeyError on those tokens); it only worked when both inputs had been
    pre-smoothed onto an identical vocabulary.
    """
    vocab = set(p1) | set(p2)
    m = {k: (p1.get(k, 0) + p2.get(k, 0)) / 2.0 for k in vocab}

    # D_{JS}(p1||p2) = (D_{KL}(p1||M) + D_{KL}(p2||M)) / 2, computed
    # directly; zero-probability terms vanish by the 0 * log 0 convention.
    js = 0.0
    for k in vocab:
        pk = p1.get(k, 0)
        qk = p2.get(k, 0)
        if pk > 0:
            js += 0.5 * pk * math.log(pk / m[k])
        if qk > 0:
            js += 0.5 * qk * math.log(qk / m[k])
    return js
|
||
if __name__ == "__main__":
    """Do not edit this code
    """
    # Tokenize both input texts into word lists.
    words_a = read_words("test_a.txt")
    words_b = read_words("test_b.txt")

    # Raw per-text word-frequency counts.
    ct_a = get_counts(words_a)
    ct_b = get_counts(words_b)

    # Smooth both count tables over the SHARED vocabulary so every word has
    # Count(x) > 0 in both texts — avoids log(0) in the measures below.
    vocab = set(ct_a.keys()) | set(ct_b.keys())
    ct_a = count_smoothing(ct_a, vocab)
    ct_b = count_smoothing(ct_b, vocab)

    # Normalize smoothed counts into probability distributions.
    p_a = create_prob_dist(ct_a)
    p_b = create_prob_dist(ct_b)

    # Information-theoretic comparisons, all in nats. Cross-entropy and KL
    # are asymmetric, so both directions are computed; JS is symmetric and
    # the two directions should agree.
    h_a = entropy(p_a)
    h_b = entropy(p_b)
    xh_ab = cross_entropy(p_a, p_b)
    xh_ba = cross_entropy(p_b, p_a)
    kl_ab = kl_divergence(p_a, p_b)
    kl_ba = kl_divergence(p_b, p_a)
    js_ab = js_divergence(p_a, p_b)
    js_ba = js_divergence(p_b, p_a)

    # Emit each metric to three decimal places, one per line (compared
    # against the expected-output file in this assignment).
    for metric in [h_a, h_b, xh_ab, xh_ba,
                   kl_ab, kl_ba, js_ab, js_ba]:
        print("{:.3f}".format(metric))
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
2.672 | ||
2.619 | ||
2.791 | ||
2.746 | ||
0.119 | ||
0.127 | ||
0.030 | ||
0.030 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash

# Replace this line with a sequence of shell commands connected with Unix pipes ("|")

# Emit the sorted set of unique words from stdin:
#   tr   - replace spaces with newlines to get one word per line
#   sort - sort all words so duplicates are adjacent
#   uniq - collapse adjacent duplicates, leaving unique words
#   sed  - remove empty lines (produced by runs of spaces / blank input lines)
# NOTE(review): `sed -r` and the `\s` class are GNU extensions — not POSIX;
# fine on Linux, may fail on BSD/macOS sed.

tr ' ' '\n' | sort | uniq | sed -r '/^\s*$/d'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped | ||
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse | ||
quick brown foxes jumped and lazy dogs slept |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
and | ||
brown | ||
cat | ||
dog | ||
dogs | ||
fox | ||
foxes | ||
indolent | ||
jumped | ||
lazy | ||
mouse | ||
near | ||
never | ||
over | ||
quick | ||
red | ||
slept | ||
spry | ||
the |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash

# Replace this line with one or more shell commands
# You may write to intermediate text files on disk if necessary

# Count lines of a CSV (read from stdin) whose 3rd field equals any
# whitespace-separated token of the 5th field.
#
# cut and paste columns for comparing:
# extract CSV fields 3 and 5 from stdin into an intermediate file
cut -d ',' -f 3,5 > col35.txt
# field 3 alone (awk '{print $1}' trims it to its first whitespace token)
cut -d ',' -f 1 < col35.txt | awk '{print $1}' > col3.txt
# field 5 alone
cut -d ',' -f 2 < col35.txt > col5.txt
# rejoin as space-separated columns; col35.txt is reused as the output name
# (safe here: paste reads only col3.txt/col5.txt, not col35.txt)
paste -d ' ' col3.txt col5.txt > col35.txt
# compare and add lines: for each line, if $1 (field 3) matches any later
# token (the field-5 words), record it once in lines.txt
# NOTE(review): awk's `print > "lines.txt"` truncates the file at first
# write within one run, but a stale lines.txt from a previous run persists
# if no line matches — consider `rm -f lines.txt` first; verify.
awk '{for(i=2;i<=NF;i++)
{
if($1 == $i)
{
print $1 > "lines.txt";
break;}
}
}' col35.txt
# print only the line count of lines.txt (wc -l FILE also echoes the
# filename, so split on spaces and keep the first token)
wc -l lines.txt | tr ' ' '\n' | head -n 1

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped | ||
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse | ||
a brown,fox and,a red,dog jumped,over the,mouse and,the cat | ||
the cat,slept and,the dog,jumped over,the mouse,while the fox,slept |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped | ||
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash

# Replace this line with one or more shell commands
# You may write to intermediate text files on disk if necessary

# For every test_*.txt file in the current directory, emit one CSV record:
#   filename,<lines containing "the">,<lines containing "a">,<lines containing "an">
# Matches are whole-word (\b boundaries) and accept a leading capital.
#
# Fix: the original ran `ls test_*.txt | for file in $(ls test_*.txt)` — a
# `for` loop never reads stdin, so the piped ls was dead code, and
# `$(ls ...)` word-splits filenames. Iterating the glob directly is correct
# and safe; $file is quoted and the useless `cat` pipes are dropped.
for file in test_*.txt
do
    # The second field should be the number of lines in the file that include "the" as a word.
    numthe=$(grep -c -E '\b[Tt]he\b' "$file")
    # The third field should be the number of lines in the file that include "a" as a word.
    numa=$(grep -c -E '\b[Aa]\b' "$file")
    # The fourth field should be the number of lines in the file that include "an" as a word.
    numan=$(grep -c -E '\b[Aa]n\b' "$file")
    echo "$file,$numthe,$numa,$numan"
done
|
Oops, something went wrong.