Skip to content

Commit

Permalink
HW
Browse files Browse the repository at this point in the history
  • Loading branch information
patelc52 committed Dec 27, 2020
1 parent 9d820f8 commit 218a145
Show file tree
Hide file tree
Showing 34 changed files with 948 additions and 0 deletions.
6 changes: 6 additions & 0 deletions hw1/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
169 changes: 169 additions & 0 deletions hw1/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.672\n",
"2.619\n",
"2.791\n",
"2.746\n",
"0.119\n",
"0.127\n",
"0.030\n",
"0.030\n"
]
}
],
"source": [
"\"\"\"Calculate information-theoretic measures of distributional\n",
"similarity based on word frequencies in two texts\n",
"\"\"\"\n",
"\n",
"import collections\n",
"import math\n",
"\n",
"\n",
"def read_words(infile):\n",
" with open(infile) as input_text:\n",
" return [x.strip() for x in input_text.read().split()]\n",
"\n",
"\n",
"def get_counts(word_list):\n",
" return collections.Counter(word_list)\n",
"\n",
"\n",
"def create_prob_dist(count_dict):\n",
" total_ct = sum(count_dict.values())\n",
" p = {x: ct / total_ct for x, ct in count_dict.items()}\n",
" return p\n",
"\n",
"\n",
"def count_smoothing(freq_dist, vocabulary, alpha=1):\n",
" \"\"\"Implement simple count-based probability smoothing.\n",
" Given a target vocabulary and a set of observed count frequencies,\n",
" calculate a new set of counts so that Count(x) > 0 for all words\n",
" in the target vocabulary. This is achieved by adding `alpha`\n",
" to each observed count\n",
" \"\"\"\n",
" return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}\n",
"\n",
"\n",
"def entropy(p):\n",
" \"\"\"Calculate entropy H(p) for a probability distribution represented\n",
" as a mapping (dictionary) from word tokens to probabilities\n",
" \"\"\"\n",
" h = 0\n",
"\n",
" # TODO -- Calculate entropy value in nats for probability distribution `p`\n",
" for x in p:\n",
" h -= p[x] * math.log(p[x])\n",
"\n",
" return h\n",
"\n",
"\n",
"def cross_entropy(p1, p2):\n",
" \"\"\"Calculate cross-entropy H(p1, p2) for two probability distributions\n",
" represented as a mapping (dictionary) from word tokens to\n",
" probabilities\n",
" \"\"\"\n",
" xh = 0\n",
"\n",
" # TODO -- Calculate cross-entropy value H(p1, p2) in nats\n",
" for x in p1:\n",
" xh -= p1[x] * math.log(p2[x])\n",
"\n",
" return xh\n",
"\n",
"\n",
"def kl_divergence(p1, p2):\n",
" \"\"\"Calculate Kullback-Leibler divergence D_{KL}(p1||p2) for two\n",
" probability distributions represented as a mapping (dictionary)\n",
" from word tokens to probabilities\n",
" \"\"\"\n",
" kl = 0\n",
"\n",
" # TODO -- Calculate KL divergence D_{KL}(p1||p2) in nats\n",
" kl = cross_entropy(p1, p2) - entropy(p1)\n",
"\n",
" return kl\n",
"\n",
"\n",
"def js_divergence(p1, p2):\n",
" \"\"\"Calculate Jensen-Shannon divergence D_{JS}(p1||p2) for two\n",
" probability distributions represented as a mapping (dictionary)\n",
" from word tokens to probabilities\n",
" \"\"\"\n",
" js = 0\n",
"\n",
" # TODO -- Calculate JS divergence D_{JS}(p1||p2) in nats\n",
" m = {k: ((p1.get(k, 0) + p2.get(k, 0))/2.0) for k in p1.keys()} \n",
" js = (kl_divergence(p1, m) + kl_divergence(p2, m))/2\n",
" \n",
" return js\n",
"\n",
"if __name__ == \"__main__\":\n",
" \"\"\"Do not edit this code\n",
" \"\"\"\n",
" words_a = read_words(\"test_a.txt\")\n",
" words_b = read_words(\"test_b.txt\")\n",
"\n",
" ct_a = get_counts(words_a)\n",
" ct_b = get_counts(words_b)\n",
"\n",
" vocab = set(ct_a.keys()) | set(ct_b.keys())\n",
" ct_a = count_smoothing(ct_a, vocab)\n",
" ct_b = count_smoothing(ct_b, vocab)\n",
"\n",
" p_a = create_prob_dist(ct_a)\n",
" p_b = create_prob_dist(ct_b)\n",
"\n",
" h_a = entropy(p_a)\n",
" h_b = entropy(p_b)\n",
" xh_ab = cross_entropy(p_a, p_b)\n",
" xh_ba = cross_entropy(p_b, p_a)\n",
" kl_ab = kl_divergence(p_a, p_b)\n",
" kl_ba = kl_divergence(p_b, p_a)\n",
" js_ab = js_divergence(p_a, p_b)\n",
" js_ba = js_divergence(p_b, p_a)\n",
"\n",
" for metric in [h_a, h_b, xh_ab, xh_ba,\n",
" kl_ab, kl_ba, js_ab, js_ba]:\n",
" print(\"{:.3f}\".format(metric))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
121 changes: 121 additions & 0 deletions hw1/dist_measures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Chandni Patel
# CS 585 - NLP
# Homework 1

"""Calculate information-theoretic measures of distributional
similarity based on word frequencies in two texts
"""

import collections
import math


def read_words(infile):
    """Read a whitespace-delimited text file and return its tokens as a list.

    ``str.split()`` with no arguments already splits on — and discards —
    all runs of whitespace, so no per-token ``strip()`` is needed.
    """
    with open(infile) as input_text:
        return input_text.read().split()


def get_counts(word_list):
    """Return a Counter mapping each word in *word_list* to its frequency."""
    counts = collections.Counter()
    counts.update(word_list)
    return counts


def create_prob_dist(count_dict):
    """Normalize a word->count mapping into a word->probability mapping.

    Each probability is the word's count divided by the total of all
    counts; an empty input yields an empty distribution.
    """
    total = sum(count_dict.values())
    prob_dist = {}
    for word, count in count_dict.items():
        prob_dist[word] = count / total
    return prob_dist


def count_smoothing(freq_dist, vocabulary, alpha=1):
    """Implement simple additive (add-alpha) count smoothing.

    Build a new count table covering every word in *vocabulary*: each
    entry is the observed count from *freq_dist* (0 if unseen) plus
    *alpha*, which guarantees Count(w) > 0 for all words in the target
    vocabulary.
    """
    smoothed = {}
    for word in vocabulary:
        observed = freq_dist.get(word, 0)
        smoothed[word] = observed + alpha
    return smoothed


def entropy(p):
    """Calculate entropy H(p) in nats for a probability distribution
    represented as a mapping (dictionary) from word tokens to
    probabilities.

    Zero-probability entries are skipped: 0 * log(0) is taken as 0 (its
    limiting value), so distributions containing zero-mass words no
    longer raise a math domain error.
    """
    h = 0
    for x, px in p.items():
        if px > 0:
            h -= px * math.log(px)
    return h


def cross_entropy(p1, p2):
    """Calculate cross-entropy H(p1, p2) in nats for two probability
    distributions represented as mappings (dictionaries) from word
    tokens to probabilities.

    Terms with p1[x] == 0 contribute nothing and are skipped, so zero
    mass in p1 no longer raises a math domain error. A KeyError (or
    math domain error) is still raised if p2 assigns no / zero
    probability to a word that p1 supports — cross-entropy is undefined
    in that case.
    """
    xh = 0
    for x, px in p1.items():
        if px > 0:
            xh -= px * math.log(p2[x])
    return xh


def kl_divergence(p1, p2):
    """Calculate Kullback-Leibler divergence D_{KL}(p1||p2) in nats for
    two probability distributions represented as mappings (dictionaries)
    from word tokens to probabilities.

    Uses the identity D_{KL}(p1||p2) = H(p1, p2) - H(p1).
    """
    return cross_entropy(p1, p2) - entropy(p1)


def js_divergence(p1, p2):
    """Calculate Jensen-Shannon divergence D_{JS}(p1||p2) in nats for
    two probability distributions represented as mappings (dictionaries)
    from word tokens to probabilities.

    The mixture M = (p1 + p2) / 2 is built over the UNION of both
    supports (the original iterated only p1's keys, which raised
    KeyError in kl_divergence(p2, m) whenever p2 contained a word
    absent from p1).
    """
    # M = (p1 + p2) / 2 over every word either distribution supports.
    support = set(p1) | set(p2)
    m = {k: (p1.get(k, 0) + p2.get(k, 0)) / 2.0 for k in support}
    # D_{JS}(p1||p2) = (D_{KL}(p1||M) + D_{KL}(p2||M)) / 2
    return (kl_divergence(p1, m) + kl_divergence(p2, m)) / 2

if __name__ == "__main__":
    """Do not edit this code
    """
    # Load the two texts to compare as lists of word tokens.
    words_a = read_words("test_a.txt")
    words_b = read_words("test_b.txt")

    # Raw per-text word frequencies.
    ct_a = get_counts(words_a)
    ct_b = get_counts(words_b)

    # Smooth both count tables over the shared vocabulary so every word
    # has a nonzero count — and hence nonzero probability — in both
    # distributions (required for the log-based measures below).
    vocab = set(ct_a.keys()) | set(ct_b.keys())
    ct_a = count_smoothing(ct_a, vocab)
    ct_b = count_smoothing(ct_b, vocab)

    # Normalize counts into probability distributions.
    p_a = create_prob_dist(ct_a)
    p_b = create_prob_dist(ct_b)

    # Compute each measure in both directions: cross-entropy and KL
    # divergence are asymmetric; JS divergence is symmetric, so js_ab
    # and js_ba should agree up to rounding.
    h_a = entropy(p_a)
    h_b = entropy(p_b)
    xh_ab = cross_entropy(p_a, p_b)
    xh_ba = cross_entropy(p_b, p_a)
    kl_ab = kl_divergence(p_a, p_b)
    kl_ba = kl_divergence(p_b, p_a)
    js_ab = js_divergence(p_a, p_b)
    js_ba = js_divergence(p_b, p_a)

    # Print all eight metrics, one per line, rounded to 3 decimal places
    # (matches the expected output in test.out.txt).
    for metric in [h_a, h_b, xh_ab, xh_ba,
                   kl_ab, kl_ba, js_ab, js_ba]:
        print("{:.3f}".format(metric))
Binary file added hw1/images/img1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added hw1/images/img2.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added hw1/images/img3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added hw1/images/img4.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 8 additions & 0 deletions hw1/test.out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
2.672
2.619
2.791
2.746
0.119
0.127
0.030
0.030
1 change: 1 addition & 0 deletions hw1/test_a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped
1 change: 1 addition & 0 deletions hw1/test_b.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse
9 changes: 9 additions & 0 deletions hw2/hw2a-handout/gen_vocab.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Emit the vocabulary of the text read on stdin:
# one word per line, sorted, de-duplicated, with blank lines dropped.

# Break the space-separated stream into one token per line, discard
# whitespace-only lines, then sort with de-duplication (sort -u is
# equivalent to sort | uniq).

tr ' ' '\n' | sed -r '/^\s*$/d' | sort -u
3 changes: 3 additions & 0 deletions hw2/hw2a-handout/test.in.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse
quick brown foxes jumped and lazy dogs slept
19 changes: 19 additions & 0 deletions hw2/hw2a-handout/test.out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
and
brown
cat
dog
dogs
fox
foxes
indolent
jumped
lazy
mouse
near
never
over
quick
red
slept
spry
the
22 changes: 22 additions & 0 deletions hw2/hw2b-handout/compare_cols.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Read comma-separated rows on stdin and print the number of rows whose
# third field (first whitespace token of it) matches their fifth field.
# Intermediate results are written to files on disk.

# Extract columns 3 and 5, then split them into separate files and
# rejoin them space-separated so awk can compare them as fields.
cut -d ',' -f 3,5 > col35.txt
cut -d ',' -f 1 < col35.txt | awk '{print $1}' > col3.txt
cut -d ',' -f 2 < col35.txt > col5.txt
paste -d ' ' col3.txt col5.txt > col35.txt

# Truncate lines.txt first: awk only opens it on the first match, so a
# stale file from a previous run would otherwise inflate the count when
# the current input has zero matching rows.
: > lines.txt

# Record one line per row whose first field equals any later field.
awk '{for(i=2;i<=NF;i++)
  {
  if($1 == $i)
    {
    print $1 > "lines.txt";
    break;}
  }
}' col35.txt

# Count the matches; reading from stdin makes wc print just the number
# (tr strips the padding BSD wc adds).
wc -l < lines.txt | tr -d ' '


4 changes: 4 additions & 0 deletions hw2/hw2b-handout/test.in.1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse
a brown,fox and,a red,dog jumped,over the,mouse and,the cat
the cat,slept and,the dog,jumped over,the mouse,while the fox,slept
2 changes: 2 additions & 0 deletions hw2/hw2b-handout/test.in.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse
1 change: 1 addition & 0 deletions hw2/hw2b-handout/test.out.1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2
1 change: 1 addition & 0 deletions hw2/hw2b-handout/test.out.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
16 changes: 16 additions & 0 deletions hw2/hw2c-handout/count_articles.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# For each test_*.txt file in the current directory, print one CSV row:
#   filename,<lines containing "the">,<lines containing "a">,<lines containing "an">
# where each article is matched as a whole word, case-insensitively on
# the first letter.

# Iterate the glob directly: piping `ls` into `for` is meaningless (the
# pipe's output was discarded) and breaks on unusual filenames.
for file in test_*.txt
do
    # Second field: number of lines that include "the" as a word.
    numthe=$(grep -c -E '\b[Tt]he\b' "$file")
    # Third field: number of lines that include "a" as a word.
    numa=$(grep -c -E '\b[Aa]\b' "$file")
    # Fourth field: number of lines that include "an" as a word.
    numan=$(grep -c -E '\b[Aa]n\b' "$file")
    echo "$file,$numthe,$numa,$numan"
done

Loading

0 comments on commit 218a145

Please sign in to comment.