-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
34 changed files
with
948 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2.672\n", | ||
"2.619\n", | ||
"2.791\n", | ||
"2.746\n", | ||
"0.119\n", | ||
"0.127\n", | ||
"0.030\n", | ||
"0.030\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"\"\"\"Calculate information-theoretic measures of distributional\n", | ||
"similarity based on word frequencies in two texts\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"import collections\n", | ||
"import math\n", | ||
"\n", | ||
"\n", | ||
"def read_words(infile):\n", | ||
" with open(infile) as input_text:\n", | ||
" return [x.strip() for x in input_text.read().split()]\n", | ||
"\n", | ||
"\n", | ||
"def get_counts(word_list):\n", | ||
" return collections.Counter(word_list)\n", | ||
"\n", | ||
"\n", | ||
"def create_prob_dist(count_dict):\n", | ||
" total_ct = sum(count_dict.values())\n", | ||
" p = {x: ct / total_ct for x, ct in count_dict.items()}\n", | ||
" return p\n", | ||
"\n", | ||
"\n", | ||
"def count_smoothing(freq_dist, vocabulary, alpha=1):\n", | ||
" \"\"\"Implement simple count-based probability smoothing.\n", | ||
" Given a target vocabulary and a set of observed count frequencies,\n", | ||
" calculate a new set of counts so that Count(x) > 0 for all words\n", | ||
" in the target vocabulary. This is achieved by adding `alpha`\n", | ||
" to each observed count\n", | ||
" \"\"\"\n", | ||
" return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}\n", | ||
"\n", | ||
"\n", | ||
"def entropy(p):\n", | ||
" \"\"\"Calculate entropy H(p) for a probability distribution represented\n", | ||
" as a mapping (dictionary) from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" h = 0\n", | ||
"\n", | ||
" # TODO -- Calculate entropy value in nats for probability distribution `p`\n", | ||
" for x in p:\n", | ||
" h -= p[x] * math.log(p[x])\n", | ||
"\n", | ||
" return h\n", | ||
"\n", | ||
"\n", | ||
"def cross_entropy(p1, p2):\n", | ||
" \"\"\"Calculate cross-entropy H(p1, p2) for two probability distributions\n", | ||
" represented as a mapping (dictionary) from word tokens to\n", | ||
" probabilities\n", | ||
" \"\"\"\n", | ||
" xh = 0\n", | ||
"\n", | ||
" # TODO -- Calculate cross-entropy value H(p1, p2) in nats\n", | ||
" for x in p1:\n", | ||
" xh -= p1[x] * math.log(p2[x])\n", | ||
"\n", | ||
" return xh\n", | ||
"\n", | ||
"\n", | ||
"def kl_divergence(p1, p2):\n", | ||
" \"\"\"Calculate Kullback-Leibler divergence D_{KL}(p1||p2) for two\n", | ||
" probability distributions represented as a mapping (dictionary)\n", | ||
" from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" kl = 0\n", | ||
"\n", | ||
" # TODO -- Calculate KL divergence D_{KL}(p1||p2) in nats\n", | ||
" kl = cross_entropy(p1, p2) - entropy(p1)\n", | ||
"\n", | ||
" return kl\n", | ||
"\n", | ||
"\n", | ||
"def js_divergence(p1, p2):\n", | ||
" \"\"\"Calculate Jensen-Shannon divergence D_{JS}(p1||p2) for two\n", | ||
" probability distributions represented as a mapping (dictionary)\n", | ||
" from word tokens to probabilities\n", | ||
" \"\"\"\n", | ||
" js = 0\n", | ||
"\n", | ||
" # TODO -- Calculate JS divergence D_{JS}(p1||p2) in nats\n", | ||
" m = {k: ((p1.get(k, 0) + p2.get(k, 0))/2.0) for k in p1.keys()} \n", | ||
" js = (kl_divergence(p1, m) + kl_divergence(p2, m))/2\n", | ||
" \n", | ||
" return js\n", | ||
"\n", | ||
"if __name__ == \"__main__\":\n", | ||
" \"\"\"Do not edit this code\n", | ||
" \"\"\"\n", | ||
" words_a = read_words(\"test_a.txt\")\n", | ||
" words_b = read_words(\"test_b.txt\")\n", | ||
"\n", | ||
" ct_a = get_counts(words_a)\n", | ||
" ct_b = get_counts(words_b)\n", | ||
"\n", | ||
" vocab = set(ct_a.keys()) | set(ct_b.keys())\n", | ||
" ct_a = count_smoothing(ct_a, vocab)\n", | ||
" ct_b = count_smoothing(ct_b, vocab)\n", | ||
"\n", | ||
" p_a = create_prob_dist(ct_a)\n", | ||
" p_b = create_prob_dist(ct_b)\n", | ||
"\n", | ||
" h_a = entropy(p_a)\n", | ||
" h_b = entropy(p_b)\n", | ||
" xh_ab = cross_entropy(p_a, p_b)\n", | ||
" xh_ba = cross_entropy(p_b, p_a)\n", | ||
" kl_ab = kl_divergence(p_a, p_b)\n", | ||
" kl_ba = kl_divergence(p_b, p_a)\n", | ||
" js_ab = js_divergence(p_a, p_b)\n", | ||
" js_ba = js_divergence(p_b, p_a)\n", | ||
"\n", | ||
" for metric in [h_a, h_b, xh_ab, xh_ba,\n", | ||
" kl_ab, kl_ba, js_ab, js_ba]:\n", | ||
" print(\"{:.3f}\".format(metric))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# Chandni Patel | ||
# CS 585 - NLP | ||
# Homework 1 | ||
|
||
"""Calculate information-theoretic measures of distributional | ||
similarity based on word frequencies in two texts | ||
""" | ||
|
||
import collections | ||
import math | ||
|
||
|
||
def read_words(infile):
    """Read a text file and return its whitespace-separated tokens as a list."""
    with open(infile) as input_text:
        contents = input_text.read()
    return [token.strip() for token in contents.split()]
|
||
|
||
def get_counts(word_list):
    """Tally occurrences of each word, returning a Counter keyed by word."""
    counts = collections.Counter()
    counts.update(word_list)
    return counts
|
||
|
||
def create_prob_dist(count_dict):
    """Normalize a word->count mapping into a word->probability mapping."""
    total = sum(count_dict.values())
    prob_dist = {}
    for word, count in count_dict.items():
        prob_dist[word] = count / total
    return prob_dist
|
||
|
||
def count_smoothing(freq_dist, vocabulary, alpha=1):
    """Apply add-alpha (Laplace) count smoothing over a target vocabulary.

    Each word in `vocabulary` gets its observed count from `freq_dist`
    (0 when unseen) plus `alpha`, so every word ends up with Count(x) > 0.
    """
    smoothed = {}
    for word in vocabulary:
        observed = freq_dist[word] if word in freq_dist else 0
        smoothed[word] = observed + alpha
    return smoothed
|
||
|
||
def entropy(p):
    """Calculate entropy H(p) in nats for a probability distribution
    represented as a mapping (dictionary) from word tokens to probabilities.

    Zero-probability entries contribute nothing (0 * log 0 == 0 by
    convention) instead of raising a math domain error, so distributions
    that explicitly list unseen tokens with probability 0 are accepted.
    """
    h = 0
    for x, px in p.items():
        # Skip px == 0: its limit contribution is 0, and math.log(0) raises.
        if px > 0:
            h -= px * math.log(px)
    return h
|
||
|
||
def cross_entropy(p1, p2):
    """Calculate cross-entropy H(p1, p2) in nats for two probability
    distributions represented as mappings (dictionaries) from word tokens
    to probabilities.

    Terms with p1[x] == 0 contribute nothing by the 0 * log 0 convention
    (the original raised ValueError if such a token had p2[x] == 0 too).
    A token with p1[x] > 0 must still be present in p2 (KeyError otherwise),
    matching the original contract.
    """
    xh = 0
    for x, px in p1.items():
        # Skip px == 0: zero-weight terms vanish in the limit.
        if px > 0:
            xh -= px * math.log(p2[x])
    return xh
|
||
|
||
def kl_divergence(p1, p2):
    """Calculate Kullback-Leibler divergence D_{KL}(p1||p2) in nats for two
    probability distributions represented as mappings (dictionaries) from
    word tokens to probabilities, using the identity
    D_{KL}(p1||p2) = H(p1, p2) - H(p1).
    """
    return cross_entropy(p1, p2) - entropy(p1)
|
||
|
||
def js_divergence(p1, p2):
    """Calculate Jensen-Shannon divergence D_{JS}(p1||p2) in nats for two
    probability distributions represented as mappings (dictionaries) from
    word tokens to probabilities.

    Bug fix: the mixture M = (p1 + p2) / 2 is now built over the UNION of
    both vocabularies. The original built it from p1's keys only, which
    dropped tokens unique to p2 (skewing M and making D_{KL}(p2||M) raise
    KeyError on those tokens); it only worked when both inputs had been
    pre-smoothed onto an identical vocabulary.
    """
    vocab = set(p1) | set(p2)
    m = {k: (p1.get(k, 0) + p2.get(k, 0)) / 2.0 for k in vocab}

    # D_{JS}(p1||p2) = (D_{KL}(p1||M) + D_{KL}(p2||M)) / 2, computed
    # directly; zero-probability terms vanish by the 0 * log 0 convention.
    js = 0.0
    for k in vocab:
        pk = p1.get(k, 0)
        qk = p2.get(k, 0)
        if pk > 0:
            js += 0.5 * pk * math.log(pk / m[k])
        if qk > 0:
            js += 0.5 * qk * math.log(qk / m[k])
    return js
|
||
if __name__ == "__main__":
    """Do not edit this code
    """
    # Tokenize both input texts into word lists.
    words_a = read_words("test_a.txt")
    words_b = read_words("test_b.txt")

    # Raw per-text word-frequency counts.
    ct_a = get_counts(words_a)
    ct_b = get_counts(words_b)

    # Smooth both count tables over the SHARED vocabulary so every word has
    # Count(x) > 0 in both texts — avoids log(0) in the measures below.
    vocab = set(ct_a.keys()) | set(ct_b.keys())
    ct_a = count_smoothing(ct_a, vocab)
    ct_b = count_smoothing(ct_b, vocab)

    # Normalize smoothed counts into probability distributions.
    p_a = create_prob_dist(ct_a)
    p_b = create_prob_dist(ct_b)

    # Information-theoretic comparisons, all in nats. Cross-entropy and KL
    # are asymmetric, so both directions are computed; JS is symmetric and
    # the two directions should agree.
    h_a = entropy(p_a)
    h_b = entropy(p_b)
    xh_ab = cross_entropy(p_a, p_b)
    xh_ba = cross_entropy(p_b, p_a)
    kl_ab = kl_divergence(p_a, p_b)
    kl_ba = kl_divergence(p_b, p_a)
    js_ab = js_divergence(p_a, p_b)
    js_ba = js_divergence(p_b, p_a)

    # Emit each metric to three decimal places, one per line (compared
    # against the expected-output file in this assignment).
    for metric in [h_a, h_b, xh_ab, xh_ba,
                   kl_ab, kl_ba, js_ab, js_ba]:
        print("{:.3f}".format(metric))
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
2.672 | ||
2.619 | ||
2.791 | ||
2.746 | ||
0.119 | ||
0.127 | ||
0.030 | ||
0.030 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash

# Replace this line with a sequence of shell commands connected with Unix pipes ("|")

# Emit the sorted set of unique words from stdin:
#   tr   - replace spaces with newlines to get one word per line
#   sort - sort all words so duplicates are adjacent
#   uniq - collapse adjacent duplicates, leaving unique words
#   sed  - remove empty lines (produced by runs of spaces / blank input lines)
# NOTE(review): `sed -r` and the `\s` class are GNU extensions — not POSIX;
# fine on Linux, may fail on BSD/macOS sed.

tr ' ' '\n' | sort | uniq | sed -r '/^\s*$/d'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
the quick brown fox jumped over the lazy dog the spry red dog jumped near the indolent fox the lazy cat never jumped | ||
the dog and the cat jumped over the fox the red and brown fox jumped over the mouse | ||
quick brown foxes jumped and lazy dogs slept |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
and | ||
brown | ||
cat | ||
dog | ||
dogs | ||
fox | ||
foxes | ||
indolent | ||
jumped | ||
lazy | ||
mouse | ||
near | ||
never | ||
over | ||
quick | ||
red | ||
slept | ||
spry | ||
the |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash

# Replace this line with one or more shell commands
# You may write to intermediate text files on disk if necessary

# Count lines of a CSV (read from stdin) whose 3rd field equals any
# whitespace-separated token of the 5th field.
#
# cut and paste columns for comparing:
# extract CSV fields 3 and 5 from stdin into an intermediate file
cut -d ',' -f 3,5 > col35.txt
# field 3 alone (awk '{print $1}' trims it to its first whitespace token)
cut -d ',' -f 1 < col35.txt | awk '{print $1}' > col3.txt
# field 5 alone
cut -d ',' -f 2 < col35.txt > col5.txt
# rejoin as space-separated columns; col35.txt is reused as the output name
# (safe here: paste reads only col3.txt/col5.txt, not col35.txt)
paste -d ' ' col3.txt col5.txt > col35.txt
# compare and add lines: for each line, if $1 (field 3) matches any later
# token (the field-5 words), record it once in lines.txt
# NOTE(review): awk's `print > "lines.txt"` truncates the file at first
# write within one run, but a stale lines.txt from a previous run persists
# if no line matches — consider `rm -f lines.txt` first; verify.
awk '{for(i=2;i<=NF;i++)
{
if($1 == $i)
{
print $1 > "lines.txt";
break;}
}
}' col35.txt
# print only the line count of lines.txt (wc -l FILE also echoes the
# filename, so split on spaces and keep the first token)
wc -l lines.txt | tr ' ' '\n' | head -n 1

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped | ||
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse | ||
a brown,fox and,a red,dog jumped,over the,mouse and,the cat | ||
the cat,slept and,the dog,jumped over,the mouse,while the fox,slept |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
the quick brown,fox jumped over the lazy,dog the,spry,red dog jumped,near the indolent fox,the lazy cat never jumped | ||
the dog,and the,cat jumped,over the,fox the red and brown,fox jumped,over the mouse |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash

# Replace this line with one or more shell commands
# You may write to intermediate text files on disk if necessary

# For every test_*.txt file in the current directory, emit one CSV record:
#   filename,<lines containing "the">,<lines containing "a">,<lines containing "an">
# Matches are whole-word (\b boundaries) and accept a leading capital.
#
# Fix: the original ran `ls test_*.txt | for file in $(ls test_*.txt)` — a
# `for` loop never reads stdin, so the piped ls was dead code, and
# `$(ls ...)` word-splits filenames. Iterating the glob directly is correct
# and safe; $file is quoted and the useless `cat` pipes are dropped.
for file in test_*.txt
do
    # The second field should be the number of lines in the file that include "the" as a word.
    numthe=$(grep -c -E '\b[Tt]he\b' "$file")
    # The third field should be the number of lines in the file that include "a" as a word.
    numa=$(grep -c -E '\b[Aa]\b' "$file")
    # The fourth field should be the number of lines in the file that include "an" as a word.
    numan=$(grep -c -E '\b[Aa]n\b' "$file")
    echo "$file,$numthe,$numa,$numan"
done
|
Oops, something went wrong.