Skip to content

Add standoff output format to NER main #724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 42 additions & 5 deletions ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,14 @@
*/
public class Main extends AbstractMain {


/** The formats in which NER results can be written. */
enum OutputFormat {
    /** bracketed annotations; this is the format used when no switch is given. */
    BRACKETED,

    /** CoNLL 2002 format, requested with the -c switch. */
    CONLL,

    /** standoff format with columns (start char, end char, label, text), requested with -s. */
    STANDOFF
}
/** selects the output format: bracketed (the default), CoNLL (-c) or standoff (-s). */
boolean bracketedOutput = true;
OutputFormat outFormat = OutputFormat.BRACKETED;

/**
* enumerates the various input processing states.
Expand Down Expand Up @@ -118,7 +124,9 @@ public Main(String[] args) {
@Override
protected int processArgument(String[] args, int current) throws Exception {
if (args[current].equals("-c"))
bracketedOutput = false;
outFormat = OutputFormat.CONLL;
if (args[current].equals("-s"))
outFormat = OutputFormat.STANDOFF;
else {
if (new File(args[current]).exists()) {
System.out.println("Loading properties from " + args[current]);
Expand All @@ -133,9 +141,10 @@ protected int processArgument(String[] args, int current) throws Exception {

@Override
protected String getCommandSyntax() {
return "java -Xms2g edu.illinois.cs.cogcomp.ner.Main -b <config_file_name> [-c]\n"
return "java -Xms2g edu.illinois.cs.cogcomp.ner.Main -b <config_file_name> [-c|-s]\n"
+ " <config_file_name> : specify the location of a configuration file.\n"
+ " -c : produce output in CoNLL 2002 format, by default, output in bracketed format.";
+ " -c : produce output in CoNLL 2002 format, by default, output in bracketed format.\n"
+ " -s : produce output in a standoff format with columns (start char, end char, label, text).";
}

/**
Expand Down Expand Up @@ -476,8 +485,10 @@ else if (token.getEndCharOffset() <= entity.getEndCharOffset())
* @return the string with the data.
*/
private String produceOutput(View nerView, TextAnnotation ta) {
    // Dispatch solely on the configured output format. The legacy boolean guard is gone:
    // nesting the enum test under it left a path with no return value.
    if (outFormat == OutputFormat.BRACKETED)
        return this.produceBracketedAnnotations(nerView, ta);
    if (outFormat == OutputFormat.STANDOFF)
        return this.produceStandoffAnnotations(nerView, ta);
    // any other format is rendered as CoNLL 2002.
    return this.produceCoNLL2002Annotations(nerView, ta);
}
Expand Down Expand Up @@ -514,6 +525,32 @@ private String produceBracketedAnnotations(View nerView, TextAnnotation ta) {
return sb.toString();
}

/**
 * Render the NER annotations as standoff records in tab-delimited format, one constituent
 * per line. Records are ordered with TextAnnotationUtilities.constituentStartComparator.
 *
 * @param nerView the NER label view.
 * @param ta the text annotation; not consulted by this method, kept so the signature matches
 *        the other output producers.
 * @return standoff annotations in (start char, end char, label, text) tab-delimited format;
 *         an empty string when the view holds no constituents.
 */
private String produceStandoffAnnotations(View nerView, TextAnnotation ta) {
    // sort a copy so the list handed back by the view is not reordered.
    List<Constituent> constituents = new ArrayList<>(nerView.getConstituents());
    Collections.sort(constituents, TextAnnotationUtilities.constituentStartComparator);

    StringBuilder sb = new StringBuilder();
    for (Constituent c : constituents) {
        sb.append(c.getStartCharOffset()).append("\t");
        sb.append(c.getEndCharOffset()).append("\t");
        sb.append(c.getLabel()).append("\t");
        // NOTE(review): this is the tokenized surface form, which presumably can differ from
        // the raw text between the two offsets -- confirm that is the intended text column.
        sb.append(c.getTokenizedSurfaceForm()).append("\n");
    }
    return sb.toString();
}

/**
* process the single input string, produce output on standard out if no output directory is
* defined, or produce the output in the output directory by the same file name as the input
Expand Down