Skip to content

WIP: feature extractors #587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions big-data-utils/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand All @@ -23,7 +23,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>org.xeustechnologies.google-api</groupId>
Expand Down
10 changes: 5 additions & 5 deletions chunker/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand All @@ -13,7 +13,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>

<dependency>
Expand All @@ -24,12 +24,12 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>LBJava-NLP-tools</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-pos</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
Expand All @@ -44,7 +44,7 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-curator</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Expand Down
20 changes: 10 additions & 10 deletions commasrl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down Expand Up @@ -35,48 +35,48 @@
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-curator</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-tokenizer</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-corpusreaders</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-inference</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>stanford_3.3.1</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-pos</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-ner</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-chunker</artifactId>
<version>3.1.36</version>
<version>3.1.39</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
Expand Down
2 changes: 1 addition & 1 deletion core-utilities/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>3.1.36</version>
<version>3.1.39</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
package edu.illinois.cs.cogcomp.core.datastructures;

import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntFloatHashMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.procedure.TIntIntProcedure;
Expand All @@ -17,12 +19,13 @@
import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
* A lexicon manager that manages features. Stores a hash value for string features and maps to an
* integer id. Optionally stores the string values too. Method previewFeature( String ) gets the
* integer id. Optionally stores the string values too.
*
* @author Vivek Srikumar
*/
Expand Down Expand Up @@ -72,6 +75,10 @@ public Lexicon(InputStream in) throws IOException {
this(in, false);
}

/**
 * Loads a lexicon from the file {@code f}.
 * <p>
 * Opens a {@link FileInputStream} on the file and delegates to the
 * {@code Lexicon(InputStream, boolean)} constructor, which wraps the stream in a
 * {@link GZIPInputStream}; the file is therefore expected to be gzip-compressed.
 * <p>
 * NOTE(review): the stream opened here is not closed in this constructor; whether the
 * delegate constructor closes it is not visible from here -- confirm to rule out a leaked
 * file handle.
 *
 * @param f the file to read the lexicon from
 * @param loadStrings forwarded unchanged to the stream-based constructor; presumably controls
 *        whether the feature name strings are loaded as well -- TODO confirm
 * @throws IOException if the file cannot be opened, or if the delegate constructor fails
 */
public Lexicon(File f, boolean loadStrings) throws IOException {
this(new FileInputStream(f), loadStrings);
}

public Lexicon(InputStream in, boolean loadStrings) throws IOException {
GZIPInputStream zipin = new GZIPInputStream(in);

Expand Down Expand Up @@ -142,6 +149,10 @@ public String lookupName(int id) {
return featureNames.get(id);
}

/**
 * Gives access to the feature name strings held by this lexicon.
 * <p>
 * The returned list is the lexicon's own backing list, not a copy, so changes made through
 * it are visible to the lexicon. It may be {@code null}: other methods of this class guard
 * their use of it with a null check.
 *
 * @return the internal list of feature names, possibly {@code null}
 */
public List<String> getFeatureNames() {
    return featureNames;
}

/**
* Increment the count for featureId.
*/
Expand Down Expand Up @@ -175,15 +186,13 @@ public synchronized void previewFeature(String f) {

// If there is a hash collision, print a warning
if (feature2Id.containsKey(featureHash)) {
logger.warn("Possible hash collision in lexicon " + "for feature name = {}, hash = {}", f,
logger.warn("Possible hash collision in lexicon for feature name = {}, hash = {}", f,
featureHash);
} else {

feature2Id.put(featureHash, nextFeatureId++);
}

if (featureNames != null) {
featureNames.add(f);
if (featureNames != null) {
featureNames.add(f);
}
}
}

Expand Down Expand Up @@ -249,6 +258,27 @@ public Pair<int[], float[]> getFeatureVector(Map<String, Float> featureMap) {
return new Pair<>(ids, vals);
}

/**
 * Converts a list of active feature names into a sparse vector of feature ids.
 * <p>
 * Names for which {@code contains} returns false are skipped. Each id is reported at most
 * once, in the order of its first occurrence in {@code features}.
 *
 * @param features names of the active features; may hold duplicates and names that are not
 *        in this lexicon
 * @return the distinct ids of the known features, in order of first occurrence
 */
public int[] getFeatureVector(List<String> features) {
    TIntList ids = new TIntArrayList();
    // Membership is tracked in a hash set: asking the TIntList itself whether it already
    // holds an id is a linear scan, which made de-duplication quadratic in the number of
    // active features.
    Set<Integer> seen = new HashSet<>();
    for (String feature : features) {
        if (!contains(feature)) {
            continue;
        }
        int id = lookupId(feature);
        if (seen.add(id)) {
            ids.add(id);
        }
    }
    return ids.toArray();
}

/**
 * Gives access to the map from feature hash to feature id.
 * <p>
 * The returned map is the lexicon's own backing map, not a copy, so changes made through it
 * are visible to the lexicon.
 *
 * @return the internal hash-to-id map
 */
public TIntIntHashMap getFeatureMap() {
    return this.feature2Id;
}

public Pair<int[], float[]> pruneFeaturesByCount(int[] idx, float[] fs, int threshold) {
int[] array = new int[idx.length];
float[] vals = new float[array.length];
Expand Down Expand Up @@ -314,19 +344,15 @@ public void save(String file) throws IOException {

writeInt(writer, feature2Id.size());

feature2Id.forEachEntry(new TIntIntProcedure() {

@Override
public boolean execute(int a, int b) {
try {
writeInt(writer, a);
writeInt(writer, b);
feature2Id.forEachEntry((hash, id) -> {
try {
writeInt(writer, hash);
writeInt(writer, id);

} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
});

if (featureNames != null) {
Expand Down Expand Up @@ -354,21 +380,43 @@ private void writeInt(BufferedWriter writer, int integer) throws IOException {

/***
* Prunes the lexicon, keeping only the features whose count is strictly greater than threshold.
* @param keepCounts if true, the feature counts are carried over into the newly generated lexicon.
* @param resetFeatureIds if true, the surviving features are re-numbered consecutively, starting from zero. This is
* useful when pruning drops many of the features and leaves many of the ids unused.
*/
public Lexicon getPrunedLexicon(final int threshold) {
final Lexicon lex = new Lexicon(false, false);

this.feature2Id.forEachEntry(new TIntIntProcedure() {
public Lexicon getPrunedLexicon(final int threshold, boolean keepCounts, boolean resetFeatureIds, boolean hasBias, boolean storeStrings) {
final Lexicon lex = new Lexicon(hasBias, storeStrings);

@Override
public boolean execute(int hash, int id) {
AtomicInteger nextId = new AtomicInteger(-1);

if (featureCounts.get(id) > threshold)
lex.feature2Id.put(hash, id);
return true;
this.feature2Id.forEachEntry((hash, id) -> {
String featureName = "";
if(storeStrings && this.featureNames != null) {
featureName = this.featureNames.get(id);
}
int count = featureCounts.get(id);
if (count > threshold) {
int newId;
if(resetFeatureIds)
newId = nextId.incrementAndGet();
else
newId = id;
lex.feature2Id.put(hash, newId);
if(keepCounts) lex.featureCounts.put(newId, count);
if(storeStrings && this.featureNames != null) {
// pad the name list with empty strings so that index newId exists before set() is called
for (int i = lex.featureNames.size(); i <= newId; i++)
lex.featureNames.add("");
lex.featureNames.set(newId, featureName);
}
}
return true;
});
lex.nextFeatureId = this.nextFeatureId;
if(resetFeatureIds)
lex.nextFeatureId = nextId.incrementAndGet();
else
lex.nextFeatureId = this.nextFeatureId;

logger.info("Number of features after pruning: " + lex.size());

Expand Down
Loading