diff --git a/src/com/jgaap/backend/API.java b/src/com/jgaap/backend/API.java index 87630a6ed..62b1faeae 100644 --- a/src/com/jgaap/backend/API.java +++ b/src/com/jgaap/backend/API.java @@ -1,849 +1,905 @@ -/* - * JGAAP -- a graphical program for stylometric authorship attribution - * Copyright (C) 2009,2011 by Patrick Juola - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ -package com.jgaap.backend; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; - -import org.apache.log4j.Logger; - -import com.jgaap.classifiers.LeaveOneOutNoDistanceDriver; -import com.jgaap.generics.AnalysisDriver; -import com.jgaap.generics.AnalyzeException; -import com.jgaap.generics.CanonicizationException; -import com.jgaap.generics.Canonicizer; -import com.jgaap.generics.DistanceFunction; -import com.jgaap.generics.EventCuller; -import com.jgaap.generics.EventCullingException; -import com.jgaap.generics.EventDriver; -import com.jgaap.generics.EventGenerationException; -import com.jgaap.generics.Language; -import 
com.jgaap.generics.LanguageParsingException; -import com.jgaap.generics.NeighborAnalysisDriver; -import com.jgaap.generics.NonDistanceDependentAnalysisDriver; -import com.jgaap.generics.ValidationDriver; -import com.jgaap.generics.WEKAAnalysisDriver; -import com.jgaap.languages.English; -import com.jgaap.util.Document; -import com.jgaap.util.EventSet; - -/** - * - * This class provides a simple interface into jgaap for use in - * other software packages and for development of any human interfaces. - * - * Instructions for using the JGAAP API: - * - * First add documents both known and unknown - * - * All other settings can be performed in any order which are setLanguage, addCanonicizer, - * addEventDriver, addEventCuller, addAnalysisDriver, addDistanceFunction - * Note: of the settings only one EventDriver and one AnalysisDriver are required to run an experiment - * - * The execute method is then used to start the experiment running - * - * Results are placed in unknown documents to access them simple use the getUnknownDocuments method in the API - * The results can be retrieved as a List> this is a sorted list - * from most likely to least likely author followed by a score generated based on your settings using the getRawResult method - * You can also get a Map of Maps of the raw results (Map>>) with the getRawResults method - * They can also be retrieved as a string using either the getFormattedResult or getResult methods. 
- * - * For examples of how to use the API class see the com.jgaap.ui package for a GUI example - * or the com.jgaap.backend.CLI class for a command line example - * - * @author Michael Ryan - * @since 5.0.0 - */ -public class API { - - static Logger logger = Logger.getLogger(API.class); - - private List documents; - private Language language; - private List eventDrivers; - private List eventCullers; - private List analysisDrivers; - - private ExecutorService executor; - - private static final API INSTANCE = new API(); - - private API() { - documents = new ArrayList(); - language = new English(); - eventDrivers = new ArrayList(); - eventCullers = new ArrayList(); - analysisDrivers = new ArrayList(); - } - - /** - * This allows a singleton of the api to be used in the gui - * or any program that needs to access a single copy of JGAAP - * from multiple classes - * - * @return a reference to the singleton API - */ - public static API getInstance(){ - return INSTANCE; - } - - /** - * This is a unique instance of the api to be used when running - * bulk experiments and you want to reset everything or if you - * want to thread running more than one experiment at a time - * as in the class com.jgaap.backend.ExperimentEngine - * - * @return a unique API instance - */ - public static API getPrivateInstance(){ - return new API(); - } - - /** - * - * This allows for the addition of documents to the system. - * Both Training (known) and Sample (unknown) documents must be provided before running an experiment. - * Training Documents are added by providing an author(tag) for them. - * Sample documents are added when no author(tag) is given. 
- * - * @param filepath - the system file path or URL to a document - * @param author - the author of this document or the tag being applied to this document, if null or the empty string this document is considered unknown and is one of those classified - * @param title - Some means of identifying the document, if null or the empty string are provided a title will be generated from the file name - * @return - a reference to the document generated - * @throws Exception - if there is a problem loading the document from file web or parsing file format - */ - public Document addDocument(String filepath, String author, String title) - throws Exception { - Document document = new Document(filepath, author, title); - return addDocument(document); - } - - /** - * Adds a previously generated document to the jgaap system. - * - * @param document - a file that has already been loaded as a Document - * @return - a reference to the document generated - */ - public Document addDocument(Document document) { - documents.add(document); - logger.info("Adding Document "+document.toString()); - return document; - } - - /** - * Removes a document from the system. - * - * @param document - a reference to the document that is to be removed - * @return - true on success false on failure - */ - public Boolean removeDocument(Document document) { - logger.info("Removing Document "+document.toString()); - return documents.remove(document); - } - - /** - * Removes all documents loaded into the system. 
- */ - public void removeAllDocuments() { - logger.info("Removing all Documents"); - documents.clear(); - } - - /** - * Get a List of all Documents currently loaded into jgaap - * - * @return - a List of Documents loaded into the system - */ - public List getDocuments() { - return documents; - } - - /** - * Get a List of all currently loaded Documents that do not have an author(tag) - * - * @return List of Documents without authors - */ - public List getUnknownDocuments() { - List unknownDocuments = new ArrayList(); - for (Document document : documents) { - if (!document.isAuthorKnown()) { - unknownDocuments.add(document); - } - } - return unknownDocuments; - } - - /** - * Get a List of Documents currently loaded into the system that have a author(tag) - * - * @return List of Documents with authors - */ - public List getKnownDocuments() { - List knownDocuments = new ArrayList(); - for (Document document : documents) { - if (document.isAuthorKnown()) { - knownDocuments.add(document); - } - } - return knownDocuments; - } - - /** - * Get a List of Documents that all have the same author(tag) - * - * @param author - the author(tag) to select documents on - * @return - List of Documents limited by the author provided - */ - public List getDocumentsByAuthor(String author) { - List authorDocuments = new ArrayList(); - for (Document document : documents) { - if (document.isAuthorKnown()) { - if (author.equalsIgnoreCase(document.getAuthor())) { - authorDocuments.add(document); - } - } - } - return authorDocuments; - } - - /** - * Get a List of all unique authors(tags) applied to Known(Training) Documents - * - * @return List of authors - */ - public List getAuthors() { - Set authors = new HashSet(); - for (Document document : documents) { - if (document.isAuthorKnown()) { - authors.add(document.getAuthor()); - } - } - List authorsList = new ArrayList(authors); - Collections.sort(authorsList); - return authorsList; - } - - /** - * Loads the documents from the file system - * 
@throws Exception - */ - public void loadDocuments() throws Exception{ - for(Document document : documents){ - document.load(); - } - } - - /** - * Adds the specified canonicizer to all documents currently loaded in the system. - * - * @param action - the unique string name representing a canonicizer (displayName()) - * @return - a reference to the canonicizer added - * @throws Exception - if the canonicizer specified cannot be found or instanced - */ - public Canonicizer addCanonicizer(String action) throws Exception { - Canonicizer canonicizer = Canonicizers.getCanonicizer(action); - for (Document document : documents) { - addCanonicizer(canonicizer, document); - } - return canonicizer; - } - - /** - * Adds the specified canonicizer to all Documents that have the DocType docType. - * - * @param action - the unique string name representing a canonicizer (displayName()) - * @param docType - The DocType this canonicizer is restricted to - * @return - a reference to the canonicizer added - * @throws Exception - if the canonicizer specified cannot be found or instanced - */ - public Canonicizer addCanonicizer(String action, Document.Type docType) throws Exception { - Canonicizer canonicizer = Canonicizers.getCanonicizer(action); - for (Document document : documents) { - if (document.getDocType().equals(docType)) { - addCanonicizer(canonicizer, document); - } - } - return canonicizer; - } - - /** - * Add the Canonicizer specified to the document referenced. 
- * - * @param action - the unique string name representing a canonicizer (displayName()) - * @param document - the Document to add the canonicizer to - * @return - a reference to the canonicizer added - * @throws Exception - if the canonicizer specified cannot be found or instanced - */ - public Canonicizer addCanonicizer(String action, Document document) - throws Exception { - Canonicizer canonicizer = Canonicizers.getCanonicizer(action); - return addCanonicizer(canonicizer, document); - } - - /** - * Add the Canonicizer specified to the document referenced. - * - * @param canonicizer - the canonicizer to add - * @param document - the Document to add the canonicizer to - * @return - a reference to the canonicizer added - */ - public Canonicizer addCanonicizer(Canonicizer canonicizer, Document document) { - document.addCanonicizer(canonicizer); - logger.info("Adding Canonicizer "+canonicizer.displayName()+" to Document "+document.toString()); - return canonicizer; - } - - public Canonicizer addCanonicizer(String action, EventDriver eventDriver) throws Exception { - Canonicizer canonicizer = Canonicizers.getCanonicizer(action); - return addCanonicizer(canonicizer, eventDriver); - } - - public Canonicizer addCanonicizer(Canonicizer canonicizer, EventDriver eventDriver) { - eventDriver.addCanonicizer(canonicizer); - logger.info("Adding Canonicizer "+canonicizer.displayName()+" to EventDriver "+eventDriver.displayName()); - return canonicizer; - } - - /** - * Removes the first instance of the canoniciser corresponding to the action(displayName()) - * from the Document referenced. 
- * - * @param canonicizer - the canonicizer to be removed - * @param document - a reference to the Document to remove the canonicizer from - */ - public void removeCanonicizer(Canonicizer canonicizer, Document document) { - document.removeCanonicizer(canonicizer); - } - - public void removeCanonicizer(Canonicizer canonicizer, EventDriver eventDriver) { - eventDriver.removeCanonicizer(canonicizer); - } - - /** - * Removes the first occurrence of the canonicizer corresponding to the action(displayName()) - * from every document - * - * @param canonicizer - the canonicizer to be removed - */ - public void removeCanonicizer(Canonicizer canonicizer) { - for (Document document : documents) { - removeCanonicizer(canonicizer, document); - } - } - - /** - * Removes the first occurrence of the canonicizer from every Document of the DocType docType - * - * @param canonicizer - the canonicizer to be removed - * @param docType - the DocType to remove the canonicizer from - */ - public void removeCanonicizer(Canonicizer canonicizer, Document.Type docType) { - for (Document document : documents) { - if (document.getDocType().equals(docType)) { - removeCanonicizer(canonicizer, document); - } - } - } - - /** - * Removes all canonicizers from Documents with the DocType docType - * - * @param docType - the DocType to remove canonicizers from - */ - public void removeAllCanonicizers(Document.Type docType) { - for (Document document : documents) { - document.clearCanonicizers(); - } - } - - /** - * Removes all canonicizers from All Documents loaded in the system - */ - public void removeAllCanonicizers() { - for (Document document : documents) { - document.clearCanonicizers(); - } - } - - /** - * Add an Event Driver which will be used to - * eventify(Generate a List of Events order in the sequence they are found in the document) - * all of the documents - * @param action - the identifier for the EventDriver to add (displayName()) - * @return - a reference to the added EventDriver - * 
@throws Exception - If the action is not found or the EventDriver cannot be instanced - */ - public EventDriver addEventDriver(String action) throws Exception { - EventDriver eventDriver = EventDrivers.getEventDriver(action); - return addEventDriver(eventDriver); - } - - /** - * Add an Event Driver which will be used to - * eventify(Generate a List of Events order in the sequence they are found in the document) - * all of the documents - * @param eventDriver - the EventDriver to add - * @return - a reference to the added EventDriver - */ - public EventDriver addEventDriver(EventDriver eventDriver) { - eventDrivers.add(eventDriver); - logger.info("Adding EventDriver "+eventDriver.displayName()); - return eventDriver; - } - - /** - * Removes the Event Driver reference from the system - * @param eventDriver - the EventDriver to be removed - * @return - true if successful false if failure - */ - public Boolean removeEventDriver(EventDriver eventDriver) { - logger.info("Removing EventDriver "+eventDriver.displayName()); - return eventDrivers.remove(eventDriver); - } - - /** - * Removes all EventDrivers from the system - */ - public void removeAllEventDrivers() { - eventDrivers.clear(); - for (Document document : documents) { - document.clearEventSets(); - } - } - - /** - * Gets a List of all EventDrivers currently loaded in the system - * @return List of All loaded EventDrivers - */ - public List getEventDrivers() { - return eventDrivers; - } - - /** - * Add an Event Culler to the system - * - * @param action - unique identifier for the event culler to add (displayName()) - * @return - a reference to the added event culler - * @throws Exception - if the EventCuller cannot be found or cannor be instanced - */ - public EventCuller addEventCuller(String action) throws Exception { - EventCuller eventCuller = EventCullers.getEventCuller(action); - eventCullers.add(eventCuller); - for(EventDriver eventDriver : eventDrivers) { - addEventCuller(eventCuller, eventDriver); - } - 
return eventCuller; - } - - public EventCuller addEventCuller(String action, EventDriver eventDriver) throws Exception { - EventCuller eventCuller = EventCullers.getEventCuller(action); - return addEventCuller(eventCuller, eventDriver); - } - - public EventCuller addEventCuller(EventCuller eventCuller, EventDriver eventDriver) { - eventDriver.addCuller(eventCuller); - logger.info("Adding EventCuller "+eventCuller.displayName()+" to "+eventDriver.displayName()); - return eventCuller; - } - - /** - * Remove the supplied EventCuller from the system - * - * @param eventCuller - EventCuller to be removed - * @return - true if success false if failure - */ - public Boolean removeEventCuller(EventCuller eventCuller) { - logger.info("Removing EventCuller "+eventCuller.displayName()); - eventCullers.remove(eventCuller); - for(EventDriver eventDriver : eventDrivers){ - eventDriver.removeCuller(eventCuller); - } - return true; - } - - /** - * Removes all loaded EventCullers from the system - */ - public void removeAllEventCullers() { - eventCullers.clear(); - for(EventDriver eventDriver : eventDrivers){ - eventDriver.clearCullers(); - } - } - - /** - * Get a List of all EventCullers currently loaded in the system - * @return List of EventCullers loaded - */ - public List getEventCullers() { - return eventCullers; - } - - /** - * Add an AnalysisDriver to the system as referenced by the action. 
- * - * @param action - the unique identifier for a AnalysisDriver (alternately a DistanceFunction) - * @return - a reference to the generated Analysis Driver - * @throws Exception - If the AnalysisDriver cannot be found or if it cannot be instanced - */ - public AnalysisDriver addAnalysisDriver(String action) throws Exception { - AnalysisDriver analysisDriver = AnalysisDrivers.getAnalysisDriver(action); - return addAnalysisDriver(analysisDriver); - } - - public AnalysisDriver addAnalysisDriver(AnalysisDriver analysisDriver) { - logger.info("Adding AnalysisDriver "+analysisDriver.displayName()); - analysisDrivers.add(analysisDriver); - return analysisDriver; - } - - /** - * Removed the passed AnalysisDriver from the system - * @param analysisDriver - reference to the AnalysisDriver to be removed - * @return True if success false if failure - */ - public Boolean removeAnalysisDriver(AnalysisDriver analysisDriver) { - logger.info("Removing AnalysisDriver "+analysisDriver.displayName()); - return analysisDrivers.remove(analysisDriver); - } - - /** - * Removes all AnalysisDrivers from the system - */ - public void removeAllAnalysisDrivers() { - analysisDrivers.clear(); - } - - /** - * Adds a DistanceFunction to the AnalysisDriver supplied. 
- * Only AnalysisDrivers that extend the NeighborAnalysisDriver can be used - * - * @param action - unique identifier for the DistanceFunction you want to add - * @param analysisDriver - a reference to the AnalysisDriver you want the distance added to - * @return - a reference to the generated DistanceFunction - * @throws Exception - if the AnalysisDriver does not extend NeighborAnalysisDriver or if the DistanceFunction cannot be found the DistanceFunction cannot be instanced - */ - public DistanceFunction addDistanceFunction(String action, - AnalysisDriver analysisDriver) throws Exception { - DistanceFunction distanceFunction = DistanceFunctions - .getDistanceFunction(action); - return addDistanceFunction(distanceFunction, analysisDriver); - } - - /** - * Adds a DistanceFunction to the AnalysisDriver supplied. - * Only AnalysisDrivers that extend the NeighborAnalysisDriver can be used - * - * @param distanceFunction - the DistanceFunction you want to add - * @param analysisDriver - a reference to the AnalysisDriver you want the distance added to - * @return - a reference to the generated DistanceFunction - */ - public DistanceFunction addDistanceFunction(DistanceFunction distanceFunction, AnalysisDriver analysisDriver) { - ((NeighborAnalysisDriver) analysisDriver).setDistance(distanceFunction); - return distanceFunction; - } - - /** - * @param action - unique identifier for the AnalysisDriver you want to add - * @param analysisDriver - a reference to the NonDistanceDependentAnalysisDriver you want - * the other driver added to - */ - public void addAnalysisDriverAsParamToOther(String action, NonDistanceDependentAnalysisDriver analysisDriver) - throws Exception { - analysisDriver.setAnalysisDriver(AnalysisDrivers.getAnalysisDriver(action)); - } - - /** - * Get a List of All AnalysisDrivers currently loaded on the system - * @return List of All AnalysisDrivers - */ - public List getAnalysisDrivers() { - return analysisDrivers; - } - - /** - * Get the current 
Language JGAAP is set to be working on - * @return - */ - public Language getLanguage(){ - return language; - } - - /** - * Set the Language that JGAAP will operate in. - * This restricts what methods are available, changes the charset that is expected when reading files, and will add any pre-processing that is needed - * @param action - the Language to operate under - * @return - a Reference to the language object selected - * @throws Exception - if the language cannot be found or cannot be instanced - */ - public Language setLanguage(String action) throws Exception { - language = Languages.getLanguage(action); - return language; - } - - /** - * Pipelines the independent aspects of loading and processing a document into separate threads - * - * Load the text from disk or the web - * Take into account any special treatment based on the language currently selected - * Place the text into canonical form using the Canonicizers - * Use the EventDrivers to transform the text into EventSets - * - * @throws Exception - */ - private void loadCanonicizeEventify() throws Exception{ - List> documentsProcessing = new ArrayList>(documents.size()); - for(final Document document : documents){ - Callable work = new Callable() { - @Override - public Document call() throws Exception { - try { - document.setLanguage(language); - document.load(); - document.processCanonicizers(); - for (EventDriver eventDriver : eventDrivers) { - char[] text = document.getText(); - for(Canonicizer canonicizer : eventDriver.getCanonicizers()){ - text = canonicizer.process(text); - } - try{ - document.addEventSet(eventDriver,eventDriver.createEventSet(text)); - } catch (EventGenerationException e) { - logger.error("Could not Eventify with "+eventDriver.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); - throw new Exception("Could not Eventify with "+eventDriver.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); - } - } - 
document.setText(""); - } catch (LanguageParsingException e) { - logger.fatal("Could not Parse Language: "+language.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); - document.failed(); - } catch (CanonicizationException e) { - logger.fatal("Could not Canonicize File: "+document.getFilePath()+" Title:"+document.getTitle(),e); - document.failed(); - } catch (Exception e) { - logger.fatal("Could not load File: "+document.getFilePath()+" Title:"+document.getTitle(),e); - document.failed(); - } - return document; - } - }; - documentsProcessing.add(executor.submit(work)); - } - - while(true){ - if(documentsProcessing.size()==0){ - break; - }else { - Iterator> documentIterator = documentsProcessing.iterator(); - while(documentIterator.hasNext()){ - Future futureDocument = documentIterator.next(); - if(futureDocument.isDone()){ - Document document = futureDocument.get(); - if(document.hasFailed()){ - throw new Exception("One or more documents could not be read / parsed / canonicized Experiment Failed"); - } - logger.info("Document: "+document.getTitle()+" has finished processing."); - documentIterator.remove(); - } - } - } - } - } - - /** - * Events are culled from EventSets across all Documents on a per EventDriver basis - * @throws EventCullingException - * @throws ExecutionException - * @throws InterruptedException - */ - private void cull() throws EventCullingException, InterruptedException, ExecutionException { - List> futureEventDrivers = new ArrayList>(); - for (EventDriver eventDriver : eventDrivers) { - if (!eventDriver.getEventCullers().isEmpty()) { - futureEventDrivers.add(executor.submit(new Culling(eventDriver))); - } - } - while(futureEventDrivers.size() != 0) { - Iterator> iterator = futureEventDrivers.iterator(); - while(iterator.hasNext()) { - Future futureEventDriver = iterator.next(); - if(futureEventDriver.isDone()){ - EventDriver eventDriver = futureEventDriver.get(); - logger.info("Finished Culling 
"+eventDriver.displayName()); - iterator.remove(); - } - } - } - } - - /** - * All loaded AnalysisDrivers are run over All EventSets comparing the Unknown(sample) to the Known(training) Documents. - */ - private void analyze() throws AnalyzeException { - List knownDocuments = new ArrayList(); - List unknownDocuments = new ArrayList(); - for (Document document : documents) { - if (document.isAuthorKnown()) { - knownDocuments.add(document); - } else { - unknownDocuments.add(document); - } - } - for (AnalysisDriver analysisDriver : analysisDrivers) { - logger.info("Training " + analysisDriver.displayName()); - analysisDriver.train(knownDocuments); - logger.info("Finished Training "+analysisDriver.displayName()); - List> futureDocuments = new ArrayList>(); - if (analysisDriver instanceof ValidationDriver || - analysisDriver instanceof LeaveOneOutNoDistanceDriver) { - for (Document knownDocument : knownDocuments) { - futureDocuments.add(executor.submit(new AnalysisWorker(knownDocument, analysisDriver))); - } - } else if (analysisDriver instanceof WEKAAnalysisDriver){ - for (Document unknownDocument : unknownDocuments){ - logger.info("Begining Analyzing: " + unknownDocument.toString()); - unknownDocument.addResult(analysisDriver, analysisDriver.analyze(unknownDocument)); - logger.info("Finished Analyzing: "+unknownDocument.toString()); - } - } else { - for (Document unknownDocument : unknownDocuments) { - futureDocuments.add(executor.submit(new AnalysisWorker(unknownDocument, analysisDriver))); - } - } - //await analysis to finish - while(futureDocuments.size() != 0){ - Iterator> iterator = futureDocuments.iterator(); - while(iterator.hasNext()) { - Future futureDocument = iterator.next(); - if(futureDocument.isDone()) { - iterator.remove(); - } - } - } - logger.info("Finished Analysis with "+analysisDriver.displayName()); - } - } - - /** - * Performs the canonicize eventify cull and analyze methods since a strict order has to be enforced when using them - * @throws 
Exception - */ - public void execute() throws Exception { - clearData(); - executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); - loadCanonicizeEventify(); - cull(); - analyze(); - executor.shutdown(); - executor.awaitTermination(5, TimeUnit.SECONDS); - } - - /** - * Removes canonicizors from all documents - */ - public void clearCanonicizers() { - for(Document document : documents){ - document.clearCanonicizers(); - } - } - - /** - * Removes all Generated data from a run but leaves all settings untouched - */ - public void clearData() { - for(Document document : documents){ - document.clearEventSets(); - document.clearResults(); - } - } - - private class Culling implements Callable { - private EventDriver eventDriver; - private ExecutorService cullingExecutor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); - - Culling(EventDriver eventDriver) { - this.eventDriver = eventDriver; - } - - @Override - public EventDriver call() throws Exception { - List eventSets = new ArrayList(); - for(Document document : documents){ - eventSets.add(document.getEventSet(eventDriver)); - } - for(EventCuller culler : eventDriver.getEventCullers()) { - culler.init(eventSets); - List> futureEventSets = new ArrayList>(eventSets.size()); - for(EventSet eventSet : eventSets) { - futureEventSets.add(cullingExecutor.submit(new CullerWorker(eventSet, culler))); - } - eventSets.clear(); - for(Future futureEventSet : futureEventSets) { - eventSets.add(futureEventSet.get()); - } - } - cullingExecutor.shutdown(); - for(int i = 0; i < documents.size(); i++) { - documents.get(i).addEventSet(eventDriver, eventSets.get(i)); - } - return eventDriver; - } - } - - private class CullerWorker implements Callable { - private EventSet eventSet; - private EventCuller culler; - - CullerWorker(EventSet eventSet, EventCuller culler) { - this.eventSet = eventSet; - this.culler = culler; - } - - public EventSet call() { - return culler.cull(eventSet); - } 
- } - - private class AnalysisWorker implements Callable { - private Document document; - private AnalysisDriver analysisDriver; - - AnalysisWorker(Document document, AnalysisDriver analysisDriver){ - this.document = document; - this.analysisDriver = analysisDriver; - } - - @Override - public Document call() throws Exception { - logger.info("Begining Analyzing: " + document.toString()); - document.addResult(analysisDriver, analysisDriver.analyze(document)); - logger.info("Finished Analyzing: "+document.toString()); - return document; - } - } -} +/* + * JGAAP -- a graphical program for stylometric authorship attribution + * Copyright (C) 2009,2011 by Patrick Juola + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ +package com.jgaap.backend; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; + +import com.jgaap.classifiers.LeaveOneOutNoDistanceDriver; +import com.jgaap.generics.AnalysisDriver; +import com.jgaap.generics.AnalyzeException; +import com.jgaap.generics.CanonicizationException; +import com.jgaap.generics.Canonicizer; +import com.jgaap.generics.DistanceFunction; +import com.jgaap.generics.EventCuller; +import com.jgaap.generics.EventCullingException; +import com.jgaap.generics.EventDriver; +import com.jgaap.generics.EventGenerationException; +import com.jgaap.generics.Language; +import com.jgaap.generics.LanguageParsingException; +import com.jgaap.generics.NeighborAnalysisDriver; +import com.jgaap.generics.NonDistanceDependentAnalysisDriver; +import com.jgaap.generics.ValidationDriver; +import com.jgaap.generics.WEKAAnalysisDriver; +import com.jgaap.languages.English; +import com.jgaap.util.Document; +import com.jgaap.util.EventSet; + +/** + * + * This class provides a simple interface into jgaap for use in + * other software packages and for development of any human interfaces. 
+ * + * Instructions for using the JGAAP API: + * + * First add documents both known and unknown + * + * All other settings can be performed in any order which are setLanguage, addCanonicizer, + * addEventDriver, addEventCuller, addAnalysisDriver, addDistanceFunction + * Note: of the settings only one EventDriver and one AnalysisDriver are required to run an experiment + * + * The execute method is then used to start the experiment running + * + * Results are placed in unknown documents to access them simple use the getUnknownDocuments method in the API + * The results can be retrieved as a List> this is a sorted list + * from most likely to least likely author followed by a score generated based on your settings using the getRawResult method + * You can also get a Map of Maps of the raw results (Map>>) with the getRawResults method + * They can also be retrieved as a string using either the getFormattedResult or getResult methods. + * + * For examples of how to use the API class see the com.jgaap.ui package for a GUI example + * or the com.jgaap.backend.CLI class for a command line example + * + * @author Michael Ryan + * @since 5.0.0 + */ +public class API { + + static Logger logger = Logger.getLogger(API.class); + + private List documents; + private Language language; + private List eventDrivers; + private List eventCullers; + private List analysisDrivers; + private ExecutorService executor; + + private static final API INSTANCE = new API(); + + private API() { + documents = new ArrayList(); + language = new English(); + eventDrivers = new ArrayList(); + eventCullers = new ArrayList(); + analysisDrivers = new ArrayList(); + } + + /** + * This allows a singleton of the api to be used in the gui + * or any program that needs to access a single copy of JGAAP + * from multiple classes + * + * @return a reference to the singleton API + */ + public static API getInstance(){ + return INSTANCE; + } + + /** + * This is a unique instance of the api to be used when running 
+ * bulk experiments and you want to reset everything or if you + * want to thread running more than one experiment at a time + * as in the class com.jgaap.backend.ExperimentEngine + * + * @return a unique API instance + */ + public static API getPrivateInstance(){ + return new API(); + } + + /** + * + * This allows for the addition of documents to the system. + * Both Training (known) and Sample (unknown) documents must be provided before running an experiment. + * Training Documents are added by providing an author(tag) for them. + * Sample documents are added when no author(tag) is given. + * + * @param filepath - the system file path or URL to a document + * @param author - the author of this document or the tag being applied to this document, if null or the empty string this document is considered unknown and is one of those classified + * @param title - Some means of identifying the document, if null or the empty string are provided a title will be generated from the file name + * @return - a reference to the document generated + * @throws Exception - if there is a problem loading the document from file web or parsing file format + */ + public Document addDocument(String filepath, String author, String title) + throws Exception { + Document document = new Document(filepath, author, title); + return addDocument(document); + } + + /** + * Adds a previously generated document to the jgaap system. + * + * @param document - a file that has already been loaded as a Document + * @return - a reference to the document generated + */ + public Document addDocument(Document document) { + documents.add(document); + logger.info("Adding Document "+document.toString()); + return document; + } + + /** + * Removes a document from the system. 
+ * + * @param document - a reference to the document that is to be removed + * @return - true on success false on failure + */ + public Boolean removeDocument(Document document) { + logger.info("Removing Document "+document.toString()); + return documents.remove(document); + } + + /** + * Removes all documents loaded into the system. + */ + public void removeAllDocuments() { + logger.info("Removing all Documents"); + documents.clear(); + } + + /** + * Get a List of all Documents currently loaded into jgaap + * + * @return - a List of Documents loaded into the system + */ + public List getDocuments() { + return documents; + } + + /** + * Get a List of all currently loaded Documents that do not have an author(tag) + * + * @return List of Documents without authors + */ + public List getUnknownDocuments() { + List unknownDocuments = new ArrayList(); + for (Document document : documents) { + if (!document.isAuthorKnown()) { + unknownDocuments.add(document); + } + } + return unknownDocuments; + } + + /** + * Get a List of Documents currently loaded into the system that have a author(tag) + * + * @return List of Documents with authors + */ + public List getKnownDocuments() { + List knownDocuments = new ArrayList(); + for (Document document : documents) { + if (document.isAuthorKnown()) { + knownDocuments.add(document); + } + } + return knownDocuments; + } + + /** + * Get a List of Documents that all have the same author(tag) + * + * @param author - the author(tag) to select documents on + * @return - List of Documents limited by the author provided + */ + public List getDocumentsByAuthor(String author) { + List authorDocuments = new ArrayList(); + for (Document document : documents) { + if (document.isAuthorKnown()) { + if (author.equalsIgnoreCase(document.getAuthor())) { + authorDocuments.add(document); + } + } + } + return authorDocuments; + } + + /** + * Get a List of all unique authors(tags) applied to Known(Training) Documents + * + * @return List of authors + */ + 
public List getAuthors() { + Set authors = new HashSet(); + for (Document document : documents) { + if (document.isAuthorKnown()) { + authors.add(document.getAuthor()); + } + } + List authorsList = new ArrayList(authors); + Collections.sort(authorsList); + return authorsList; + } + + /** + * Loads the documents from the file system + * @throws Exception + */ + public void loadDocuments() throws Exception{ + for(Document document : documents){ + document.load(); + } + } + + /** + * Adds the specified canonicizer to all documents currently loaded in the system. + * + * @param action - the unique string name representing a canonicizer (displayName()) + * @return - a reference to the canonicizer added + * @throws Exception - if the canonicizer specified cannot be found or instanced + */ + public Canonicizer addCanonicizer(String action) throws Exception { + Canonicizer canonicizer = Canonicizers.getCanonicizer(action); + for (Document document : documents) { + addCanonicizer(canonicizer, document); + } + return canonicizer; + } + + /** + * Adds the specified canonicizer to all Documents that have the DocType docType. + * + * @param action - the unique string name representing a canonicizer (displayName()) + * @param docType - The DocType this canonicizer is restricted to + * @return - a reference to the canonicizer added + * @throws Exception - if the canonicizer specified cannot be found or instanced + */ + public Canonicizer addCanonicizer(String action, Document.Type docType) throws Exception { + Canonicizer canonicizer = Canonicizers.getCanonicizer(action); + for (Document document : documents) { + if (document.getDocType().equals(docType)) { + addCanonicizer(canonicizer, document); + } + } + return canonicizer; + } + + /** + * Add the Canonicizer specified to the document referenced. 
+ * + * @param action - the unique string name representing a canonicizer (displayName()) + * @param document - the Document to add the canonicizer to + * @return - a reference to the canonicizer added + * @throws Exception - if the canonicizer specified cannot be found or instanced + */ + public Canonicizer addCanonicizer(String action, Document document) + throws Exception { + Canonicizer canonicizer = Canonicizers.getCanonicizer(action); + return addCanonicizer(canonicizer, document); + } + + /** + * Add the Canonicizer specified to the document referenced. + * + * @param canonicizer - the canonicizer to add + * @param document - the Document to add the canonicizer to + * @return - a reference to the canonicizer added + */ + public Canonicizer addCanonicizer(Canonicizer canonicizer, Document document) { + document.addCanonicizer(canonicizer); + logger.info("Adding Canonicizer "+canonicizer.displayName()+" to Document "+document.toString()); + return canonicizer; + } + + public Canonicizer addCanonicizer(String action, EventDriver eventDriver) throws Exception { + Canonicizer canonicizer = Canonicizers.getCanonicizer(action); + return addCanonicizer(canonicizer, eventDriver); + } + + public Canonicizer addCanonicizer(Canonicizer canonicizer, EventDriver eventDriver) { + eventDriver.addCanonicizer(canonicizer); + logger.info("Adding Canonicizer "+canonicizer.displayName()+" to EventDriver "+eventDriver.displayName()); + return canonicizer; + } + + /** + * Removes the first instance of the canoniciser corresponding to the action(displayName()) + * from the Document referenced. 
+ * + * @param canonicizer - the canonicizer to be removed + * @param document - a reference to the Document to remove the canonicizer from + */ + public void removeCanonicizer(Canonicizer canonicizer, Document document) { + document.removeCanonicizer(canonicizer); + } + + public void removeCanonicizer(Canonicizer canonicizer, EventDriver eventDriver) { + eventDriver.removeCanonicizer(canonicizer); + } + + /** + * Removes the first occurrence of the canonicizer corresponding to the action(displayName()) + * from every document + * + * @param canonicizer - the canonicizer to be removed + */ + public void removeCanonicizer(Canonicizer canonicizer) { + for (Document document : documents) { + removeCanonicizer(canonicizer, document); + } + } + + /** + * Removes the first occurrence of the canonicizer from every Document of the DocType docType + * + * @param canonicizer - the canonicizer to be removed + * @param docType - the DocType to remove the canonicizer from + */ + public void removeCanonicizer(Canonicizer canonicizer, Document.Type docType) { + for (Document document : documents) { + if (document.getDocType().equals(docType)) { + removeCanonicizer(canonicizer, document); + } + } + } + + /** + * Removes all canonicizers from Documents with the DocType docType + * + * @param docType - the DocType to remove canonicizers from + */ + public void removeAllCanonicizers(Document.Type docType) { + for (Document document : documents) { + document.clearCanonicizers(); + } + } + + /** + * Removes all canonicizers from All Documents loaded in the system + */ + public void removeAllCanonicizers() { + for (Document document : documents) { + document.clearCanonicizers(); + } + } + + /** + * Add an Event Driver which will be used to + * eventify(Generate a List of Events order in the sequence they are found in the document) + * all of the documents + * @param action - the identifier for the EventDriver to add (displayName()) + * @return - a reference to the added EventDriver + * 
@throws Exception - If the action is not found or the EventDriver cannot be instanced + */ + public EventDriver addEventDriver(String action) throws Exception { + EventDriver eventDriver = EventDrivers.getEventDriver(action); + return addEventDriver(eventDriver); + } + + /** + * Add an Event Driver which will be used to + * eventify(Generate a List of Events order in the sequence they are found in the document) + * all of the documents + * @param eventDriver - the EventDriver to add + * @return - a reference to the added EventDriver + */ + public EventDriver addEventDriver(EventDriver eventDriver) { + eventDrivers.add(eventDriver); + logger.info("Adding EventDriver "+eventDriver.displayName()); + return eventDriver; + } + + /** + * Removes the Event Driver reference from the system + * @param eventDriver - the EventDriver to be removed + * @return - true if successful false if failure + */ + public Boolean removeEventDriver(EventDriver eventDriver) { + logger.info("Removing EventDriver "+eventDriver.displayName()); + return eventDrivers.remove(eventDriver); + } + + /** + * Removes all EventDrivers from the system + */ + public void removeAllEventDrivers() { + eventDrivers.clear(); + for (Document document : documents) { + document.clearEventSets(); + } + } + + /** + * Gets a List of all EventDrivers currently loaded in the system + * @return List of All loaded EventDrivers + */ + public List getEventDrivers() { + return eventDrivers; + } + + /** + * Add an Event Culler to the system + * + * @param action - unique identifier for the event culler to add (displayName()) + * @return - a reference to the added event culler + * @throws Exception - if the EventCuller cannot be found or cannor be instanced + */ + public EventCuller addEventCuller(String action) throws Exception { + EventCuller eventCuller = EventCullers.getEventCuller(action); + eventCullers.add(eventCuller); + for(EventDriver eventDriver : eventDrivers) { + addEventCuller(eventCuller, eventDriver); + } + 
return eventCuller; + } + + public EventCuller addEventCuller(String action, EventDriver eventDriver) throws Exception { + EventCuller eventCuller = EventCullers.getEventCuller(action); + return addEventCuller(eventCuller, eventDriver); + } + + public EventCuller addEventCuller(EventCuller eventCuller, EventDriver eventDriver) { + eventDriver.addCuller(eventCuller); + logger.info("Adding EventCuller "+eventCuller.displayName()+" to "+eventDriver.displayName()); + return eventCuller; + } + + /** + * Remove the supplied EventCuller from the system + * + * @param eventCuller - EventCuller to be removed + * @return - true if success false if failure + */ + public Boolean removeEventCuller(EventCuller eventCuller) { + logger.info("Removing EventCuller "+eventCuller.displayName()); + eventCullers.remove(eventCuller); + for(EventDriver eventDriver : eventDrivers){ + eventDriver.removeCuller(eventCuller); + } + return true; + } + + /** + * Removes all loaded EventCullers from the system + */ + public void removeAllEventCullers() { + eventCullers.clear(); + for(EventDriver eventDriver : eventDrivers){ + eventDriver.clearCullers(); + } + } + + /** + * Get a List of all EventCullers currently loaded in the system + * @return List of EventCullers loaded + */ + public List getEventCullers() { + return eventCullers; + } + + /** + * Add an AnalysisDriver to the system as referenced by the action. 
+ * + * @param action - the unique identifier for a AnalysisDriver (alternately a DistanceFunction) + * @return - a reference to the generated Analysis Driver + * @throws Exception - If the AnalysisDriver cannot be found or if it cannot be instanced + */ + public AnalysisDriver addAnalysisDriver(String action) throws Exception { + AnalysisDriver analysisDriver = AnalysisDrivers.getAnalysisDriver(action); + return addAnalysisDriver(analysisDriver); + } + + public AnalysisDriver addAnalysisDriver(AnalysisDriver analysisDriver) { + logger.info("Adding AnalysisDriver "+analysisDriver.displayName()); + analysisDrivers.add(analysisDriver); + return analysisDriver; + } + + /** + * Removed the passed AnalysisDriver from the system + * @param analysisDriver - reference to the AnalysisDriver to be removed + * @return True if success false if failure + */ + public Boolean removeAnalysisDriver(AnalysisDriver analysisDriver) { + logger.info("Removing AnalysisDriver "+analysisDriver.displayName()); + return analysisDrivers.remove(analysisDriver); + } + + /** + * Removes all AnalysisDrivers from the system + */ + public void removeAllAnalysisDrivers() { + analysisDrivers.clear(); + } + + /** + * Adds a DistanceFunction to the AnalysisDriver supplied. 
+ * Only AnalysisDrivers that extend the NeighborAnalysisDriver can be used + * + * @param action - unique identifier for the DistanceFunction you want to add + * @param analysisDriver - a reference to the AnalysisDriver you want the distance added to + * @return - a reference to the generated DistanceFunction + * @throws Exception - if the AnalysisDriver does not extend NeighborAnalysisDriver or if the DistanceFunction cannot be found the DistanceFunction cannot be instanced + */ + public DistanceFunction addDistanceFunction(String action, + AnalysisDriver analysisDriver) throws Exception { + DistanceFunction distanceFunction = DistanceFunctions + .getDistanceFunction(action); + return addDistanceFunction(distanceFunction, analysisDriver); + } + + /** + * Adds a DistanceFunction to the AnalysisDriver supplied. + * Only AnalysisDrivers that extend the NeighborAnalysisDriver can be used + * + * @param distanceFunction - the DistanceFunction you want to add + * @param analysisDriver - a reference to the AnalysisDriver you want the distance added to + * @return - a reference to the generated DistanceFunction + */ + public DistanceFunction addDistanceFunction(DistanceFunction distanceFunction, AnalysisDriver analysisDriver) { + ((NeighborAnalysisDriver) analysisDriver).setDistance(distanceFunction); + return distanceFunction; + } + + /** + * @param action - unique identifier for the AnalysisDriver you want to add + * @param analysisDriver - a reference to the NonDistanceDependentAnalysisDriver you want + * the other driver added to + */ + public void addAnalysisDriverAsParamToOther(String action, NonDistanceDependentAnalysisDriver analysisDriver) + throws Exception { + analysisDriver.setAnalysisDriver(AnalysisDrivers.getAnalysisDriver(action)); + } + + /** + * Get a List of All AnalysisDrivers currently loaded on the system + * @return List of All AnalysisDrivers + */ + public List getAnalysisDrivers() { + return analysisDrivers; + } + + /** + * Get the current 
Language JGAAP is set to be working on + * @return + */ + public Language getLanguage(){ + return language; + } + + /** + * Set the Language that JGAAP will operate in. + * This restricts what methods are available, changes the charset that is expected when reading files, and will add any pre-processing that is needed + * @param action - the Language to operate under + * @return - a Reference to the language object selected + * @throws Exception - if the language cannot be found or cannot be instanced + */ + public Language setLanguage(String action) throws Exception { + language = Languages.getLanguage(action); + return language; + } + + /** + * Pipelines the independent aspects of loading and processing a document into separate threads + * + * Load the text from disk or the web + * Take into account any special treatment based on the language currently selected + * Place the text into canonical form using the Canonicizers + * Use the EventDrivers to transform the text into EventSets + * + * @throws Exception + */ + private void loadCanonicizeEventify() throws Exception{ + List> documentsProcessing = new ArrayList>(documents.size()); + for(final Document document : documents){ + Callable work = new Callable() { + @Override + public Document call() throws Exception { + try { + document.setLanguage(language); + document.load(); + document.processCanonicizers(); + for (EventDriver eventDriver : eventDrivers) { + char[] text = document.getText(); + for(Canonicizer canonicizer : eventDriver.getCanonicizers()){ + text = canonicizer.process(text); + } + try{ + document.addEventSet(eventDriver,eventDriver.createEventSet(text)); + } catch (EventGenerationException e) { + logger.error("Could not Eventify with "+eventDriver.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); + throw new Exception("Could not Eventify with "+eventDriver.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); + } + } + 
document.setText(""); + } catch (LanguageParsingException e) { + logger.fatal("Could not Parse Language: "+language.displayName()+" on File:"+document.getFilePath()+" Title:"+document.getTitle(),e); + document.failed(); + } catch (CanonicizationException e) { + logger.fatal("Could not Canonicize File: "+document.getFilePath()+" Title:"+document.getTitle(),e); + document.failed(); + } catch (Exception e) { + logger.fatal("Could not load File: "+document.getFilePath()+" Title:"+document.getTitle(),e); + document.failed(); + } + return document; + } + }; + documentsProcessing.add(executor.submit(work)); + } + + while(true){ + if(documentsProcessing.size()==0){ + break; + }else { + Iterator> documentIterator = documentsProcessing.iterator(); + while(documentIterator.hasNext()){ + Future futureDocument = documentIterator.next(); + if(futureDocument.isDone()){ + Document document = futureDocument.get(); + if(document.hasFailed()){ + throw new Exception("One or more documents could not be read / parsed / canonicized Experiment Failed"); + } + logger.info("Document: "+document.getTitle()+" has finished processing."); + documentIterator.remove(); + } + } + } + } + } + + /** + * Events are culled from EventSets across all Documents on a per EventDriver basis + * @throws EventCullingException + * @throws ExecutionException + * @throws InterruptedException + */ + private void cull() throws EventCullingException, InterruptedException, ExecutionException { + List> futureEventDrivers = new ArrayList>(); + for (EventDriver eventDriver : eventDrivers) { + if (!eventDriver.getEventCullers().isEmpty()) { + futureEventDrivers.add(executor.submit(new Culling(eventDriver))); + } + } + while(futureEventDrivers.size() != 0) { + Iterator> iterator = futureEventDrivers.iterator(); + while(iterator.hasNext()) { + Future futureEventDriver = iterator.next(); + if(futureEventDriver.isDone()){ + EventDriver eventDriver = futureEventDriver.get(); + logger.info("Finished Culling 
"+eventDriver.displayName()); + iterator.remove(); + } + } + } + } + + /** + * All loaded AnalysisDrivers are run over All EventSets comparing the Unknown(sample) to the Known(training) Documents. + */ + private void analyze() throws AnalyzeException { + List knownDocuments = new ArrayList(); + List unknownDocuments = new ArrayList(); + for (Document document : documents) { + if (document.isAuthorKnown()) { + knownDocuments.add(document); + } else { + unknownDocuments.add(document); + } + } + for (AnalysisDriver analysisDriver : analysisDrivers) { + List> futureDocuments = new ArrayList>(); + if(analysisDriver instanceof ValidationDriver) { //For some reason, if we unify ValidationDriver and LeaveOneOutNoDistanceDriver into the same if statement, it doesn't work. + for (Document knownDocument : knownDocuments) { + List knownDocuments2 = new ArrayList(); + for(Document knownDocument2 : knownDocuments){ +//This is messy and time-consuming, but setting knownDocuments2 = knownDocuments and then removing knownDocument from knownDocuments2 doesn't work, not sure why. 
+ if(!knownDocument2.equals(knownDocument)) + knownDocuments2.add(knownDocument2); + } + logger.info("Training " + analysisDriver.displayName()); + analysisDriver.train(knownDocuments2); + logger.info("Finished Training "+analysisDriver.displayName()); + futureDocuments.add(executor.submit(new AnalysisWorker(knownDocument, analysisDriver))); + //await analysis to finish + while(futureDocuments.size() != 0){ + Iterator> iterator = futureDocuments.iterator(); + while(iterator.hasNext()) { + Future futureDocument = iterator.next(); + if(futureDocument.isDone()) { + iterator.remove(); + } + } + + } + } + } else if(analysisDriver instanceof LeaveOneOutNoDistanceDriver) { + if(analysisDriver.displayName().contains("Weighted Voting")) { + analysisDriver.train(knownDocuments); + for(Document knownDocument : knownDocuments) { //Weighted voting is NOT training on test here + //when the user trains weighted voting, all it does is assign the weights to the classifiers by implementing LOOCV + // Upon calling analyze, it will train its classifiers for analysis + analysisDriver.analyze(knownDocument); + } + } + else { + for (Document knownDocument : knownDocuments) { + List knownDocuments2 = new ArrayList(); + for(Document knownDocument2 : knownDocuments){ + if(!knownDocument2.equals(knownDocument)) + knownDocuments2.add(knownDocument2); + } + logger.info("Training " + analysisDriver.displayName()); + analysisDriver.train(knownDocuments2); + logger.info("Finished Training "+analysisDriver.displayName()); + futureDocuments.add(executor.submit(new AnalysisWorker(knownDocument, analysisDriver))); + //await analysis to finish + while(futureDocuments.size() != 0){ + Iterator> iterator = futureDocuments.iterator(); + while(iterator.hasNext()) { + Future futureDocument = iterator.next(); + if(futureDocument.isDone()) { + iterator.remove(); + } + } + + } + } + } + + } else if (analysisDriver instanceof WEKAAnalysisDriver){ + logger.info("Training " + analysisDriver.displayName()); + 
analysisDriver.train(knownDocuments); + logger.info("Finished Training "+analysisDriver.displayName()); + for (Document unknownDocument : unknownDocuments){ + logger.info("Begining Analyzing: " + unknownDocument.toString()); + unknownDocument.addResult(analysisDriver, analysisDriver.analyze(unknownDocument)); + logger.info("Finished Analyzing: "+unknownDocument.toString()); + } + } else { + logger.info("Training " + analysisDriver.displayName()); + analysisDriver.train(knownDocuments); + logger.info("Finished Training "+analysisDriver.displayName()); + for (Document unknownDocument : unknownDocuments) { + futureDocuments.add(executor.submit(new AnalysisWorker(unknownDocument, analysisDriver))); + } + //await analysis to finish + while(futureDocuments.size() != 0){ + Iterator> iterator = futureDocuments.iterator(); + while(iterator.hasNext()) { + Future futureDocument = iterator.next(); + if(futureDocument.isDone()) { + iterator.remove(); + } + } + + } + } + logger.info("Finished Analysis with "+analysisDriver.displayName()); + } + } + + /** + * Performs the canonicize eventify cull and analyze methods since a strict order has to be enforced when using them + * @throws Exception + */ + public void execute() throws Exception { + clearData(); + executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); + loadCanonicizeEventify(); + cull(); + analyze(); + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.SECONDS); + } + + /** + * Removes canonicizors from all documents + */ + public void clearCanonicizers() { + for(Document document : documents){ + document.clearCanonicizers(); + } + } + + /** + * Removes all Generated data from a run but leaves all settings untouched + */ + public void clearData() { + for(Document document : documents){ + document.clearEventSets(); + document.clearResults(); + } + } + + private class Culling implements Callable { + private EventDriver eventDriver; + private ExecutorService cullingExecutor = 
Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); + + Culling(EventDriver eventDriver) { + this.eventDriver = eventDriver; + } + + @Override + public EventDriver call() throws Exception { + List eventSets = new ArrayList(); + for(Document document : documents){ + eventSets.add(document.getEventSet(eventDriver)); + } + for(EventCuller culler : eventDriver.getEventCullers()) { + culler.init(eventSets); + List> futureEventSets = new ArrayList>(eventSets.size()); + for(EventSet eventSet : eventSets) { + futureEventSets.add(cullingExecutor.submit(new CullerWorker(eventSet, culler))); + } + eventSets.clear(); + for(Future futureEventSet : futureEventSets) { + eventSets.add(futureEventSet.get()); + } + } + cullingExecutor.shutdown(); + for(int i = 0; i < documents.size(); i++) { + documents.get(i).addEventSet(eventDriver, eventSets.get(i)); + } + return eventDriver; + } + } + + private class CullerWorker implements Callable { + private EventSet eventSet; + private EventCuller culler; + + CullerWorker(EventSet eventSet, EventCuller culler) { + this.eventSet = eventSet; + this.culler = culler; + } + + public EventSet call() { + return culler.cull(eventSet); + } + } + + private class AnalysisWorker implements Callable { + private Document document; + private AnalysisDriver analysisDriver; + + AnalysisWorker(Document document, AnalysisDriver analysisDriver){ + this.document = document; + this.analysisDriver = analysisDriver; + } + + @Override + public Document call() throws Exception { + logger.info("Begining Analyzing: " + document.toString()); + document.addResult(analysisDriver, analysisDriver.analyze(document)); + logger.info("Finished Analyzing: "+document.toString()); + return document; + } + } +} diff --git a/src/com/jgaap/classifiers/BurrowsDelta.java b/src/com/jgaap/classifiers/BurrowsDelta.java index 0df29c9a4..7b5f3d63d 100644 --- a/src/com/jgaap/classifiers/BurrowsDelta.java +++ b/src/com/jgaap/classifiers/BurrowsDelta.java @@ -115,4 +115,4 
@@ public List> analyze(Document unknown) { Collections.sort(results); return results; } -} +} \ No newline at end of file diff --git a/src/com/jgaap/classifiers/KNearestNeighborDriver.java b/src/com/jgaap/classifiers/KNearestNeighborDriver.java index 85849da32..35376993d 100644 --- a/src/com/jgaap/classifiers/KNearestNeighborDriver.java +++ b/src/com/jgaap/classifiers/KNearestNeighborDriver.java @@ -49,9 +49,12 @@ public class KNearestNeighborDriver extends NeighborAnalysisDriver { private static final int DEFAULT_K = 5; private static final String DEFAULT_TIE = "lastPicked"; - + public KNearestNeighborDriver() { - addParams("k", "K", "5", new String[] {"1","2","3","4","5","6","7","8","9","10"}, false); + addParams("k", "K: Number of Neighbors", "5", new String[] { "1", "2", + "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", + "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", + "24", "25" }, false); } public String displayName() { @@ -107,7 +110,7 @@ public List> analyze(Document unknown) throws AnalyzeExcept } List> results = ballot.getResults(); - Comparator> compareByScore = (Pair r1, Pair r2) -> r2.getSecond().compareTo(r1.getSecond()); + Comparator> compareByScore = (Pair r1, Pair r2) -> r1.getSecond().compareTo(r2.getSecond()); Collections.sort(results, compareByScore); return results; diff --git a/src/com/jgaap/classifiers/LeaveOneOutKNearestNeighborDriver.java b/src/com/jgaap/classifiers/LeaveOneOutKNearestNeighborDriver.java new file mode 100644 index 000000000..c87b42c7c --- /dev/null +++ b/src/com/jgaap/classifiers/LeaveOneOutKNearestNeighborDriver.java @@ -0,0 +1,144 @@ +/* +* JGAAP -- a graphical program for stylometric authorship attribution + * Copyright (C) 2009,2011 by Patrick Juola + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) 
/*
 * JGAAP -- a graphical program for stylometric authorship attribution
 * Copyright (C) 2009,2011 by Patrick Juola
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.jgaap.classifiers;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import com.google.common.collect.ImmutableList;
import com.jgaap.generics.AnalyzeException;
import com.jgaap.generics.DistanceCalculationException;
import com.jgaap.generics.ValidationDriver;
import com.jgaap.util.Ballot;
import com.jgaap.util.Document;
import com.jgaap.util.EventMap;
import com.jgaap.util.Pair;

/**
 * Leave-one-out k-nearest-neighbour validation driver
 * (KNN LOOCV implementation by Alejandro Jorge Napolitano Jawerbaum).
 *
 * <p>{@link #train(List)} caches an {@link EventMap} for every known document.
 * {@link #analyze(Document)} measures the distance from the tested document to
 * every cached document except the tested document itself, lets the {@code k}
 * closest ones vote, and returns the authors ordered by descending vote score.
 */
public class LeaveOneOutKNearestNeighborDriver extends ValidationDriver {

    private static final java.util.logging.Logger logger =
            java.util.logging.Logger.getLogger(LeaveOneOutKNearestNeighborDriver.class.getName());

    private static final int DEFAULT_K = 5;
    private static final String DEFAULT_TIE = "lastPicked";

    /** Known documents paired with their event maps; populated by {@link #train(List)}. */
    private ImmutableList<Pair<Document, EventMap>> knowns;

    public LeaveOneOutKNearestNeighborDriver() {
        addParams("k", "K: Number of Neighbors", "5", new String[] { "1", "2",
                "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
                "14", "15", "16", "17", "18", "19", "20", "21", "22", "23",
                "24", "25" }, false);
    }

    @Override
    public String displayName() {
        return "Leave One Out K-Nearest Neighbor driver" + this.getDistanceName();
    }

    @Override
    public String tooltipText() {
        return " ";
    }

    @Override
    public boolean showInGUI() {
        return true;
    }

    /**
     * Builds and stores one event map per known document.
     *
     * @param knowns the known documents to validate against
     */
    @Override
    public void train(List<Document> knowns) {
        ImmutableList.Builder<Pair<Document, EventMap>> builder = ImmutableList.builder();
        for (Document known : knowns) {
            builder.add(new Pair<Document, EventMap>(known, new EventMap(known)));
        }
        this.knowns = builder.build();
    }

    /**
     * Classifies {@code unknown} against every trained document other than itself.
     *
     * @param unknown the document being tested; a trained document equal to it is skipped
     * @return authors with their vote scores, highest score first
     * @throws AnalyzeException if the distance function fails; the original
     *         {@link DistanceCalculationException} is attached as the cause
     */
    @Override
    public List<Pair<String, Double>> analyze(Document unknown) throws AnalyzeException {
        int k = getParameter("k", DEFAULT_K);
        String tieBreaker = getParameter("tieBreaker", DEFAULT_TIE);

        // The tested document's event map does not depend on the known document,
        // so build it once instead of once per comparison.
        EventMap unknownEvents = new EventMap(unknown);

        List<Pair<String, Double>> rawResults = new ArrayList<Pair<String, Double>>();
        for (Pair<Document, EventMap> known : knowns) {
            if (known.getFirst().equals(unknown)) {
                logger.info("Excluded document that's being tested.");
                continue;
            }
            double current;
            try {
                current = distance.distance(unknownEvents, known.getSecond());
            } catch (DistanceCalculationException e) {
                AnalyzeException failure =
                        new AnalyzeException("Distance "+distance.displayName()+" failed");
                failure.initCause(e); // keep the underlying failure for diagnosis
                throw failure;
            }
            rawResults.add(new Pair<String, Double>(known.getFirst().getAuthor(), current, 2));
        }
        Collections.sort(rawResults);

        // The i-th neighbour (0-based) contributes 1 + 2^-(i+1): the integer part of
        // an author's total counts votes, the fraction records which ranks voted.
        Ballot<String> ballot = new Ballot<String>();
        int voters = Math.min(k, rawResults.size());
        for (int i = 0; i < voters; i++) {
            Pair<String, Double> neighbor = rawResults.get(i);
            ballot.vote(neighbor.getFirst(), (1 + Math.pow(2, (-1.0 * (i + 1)))));
        }

        if (tieBreaker.equals("lastPicked")) {
            ballot.setComparator(new LastPickedComparator());
        }

        List<Pair<String, Double>> results = ballot.getResults();
        Comparator<Pair<String, Double>> compareByScore =
                (Pair<String, Double> r1, Pair<String, Double> r2) -> r2.getSecond().compareTo(r1.getSecond());
        Collections.sort(results, compareByScore);

        return results;
    }

    /**
     * Orders ballot entries by whole vote count and, for equal counts, by which
     * entry received its last vote earlier.
     */
    private static class LastPickedComparator implements Comparator<Pair<String, Double>>, Serializable {

        private static final long serialVersionUID = 1L;

        /** Fractions at or below this value are treated as zero. */
        private static final double EPSILON = 0.0000001;

        @Override
        public int compare(Pair<String, Double> firstPair, Pair<String, Double> secondPair) {
            double first = firstPair.getSecond();
            double second = secondPair.getSecond();

            // If the overall rank was not the same, then return these according to rank.
            int firstVotes = (int) first;
            int secondVotes = (int) second;
            if (firstVotes != secondVotes) {
                return Integer.compare(firstVotes, secondVotes);
            }

            // Identical scores are equal; a comparator must return 0 for compare(x, x).
            if (first == second) {
                return 0;
            }

            // Otherwise, shift the binary point right until first is an integer.
            // The fractional part is x - floor(x); the previous test used
            // (int) x - x, which is never positive, so this loop never ran.
            while (fractionOf(first) > EPSILON) {
                first *= 2;
                second *= 2;
            }
            // If first had fewer binary places than second, the last first vote
            // came BEFORE the last second vote.
            if (fractionOf(second) > EPSILON) {
                return 1;
            }
            // Otherwise, the last second vote came before the last first vote.
            return -1;
        }

        /** Returns the fractional part of a non-negative value. */
        private static double fractionOf(double value) {
            return value - Math.floor(value);
        }
    }
}
+ analysisDriver.setParamGUI(getParamGUI()); analysisDriver.train(knownsTemp); return analysisDriver.analyze(fakeUnknown); } diff --git a/src/com/jgaap/classifiers/WEKALogisticRegression.java b/src/com/jgaap/classifiers/WEKALogisticRegression.java new file mode 100644 index 000000000..d5a7a2daa --- /dev/null +++ b/src/com/jgaap/classifiers/WEKALogisticRegression.java @@ -0,0 +1,35 @@ +package com.jgaap.classifiers; + +import java.util.List; + +import com.jgaap.generics.AnalyzeException; +import com.jgaap.generics.WEKAAnalysisDriver; +import com.jgaap.util.Document; + +import weka.classifiers.Classifier; + +public class WEKALogisticRegression extends WEKAAnalysisDriver { + @Override + public String displayName() { + return "WEKA Logistic Regression"; + } + + @Override + public String tooltipText() { + return "Multinomial logistic regression, Courtesy of WEKA"; + } + + @Override + public boolean showInGUI() { + return true; + } + + public Classifier getClassifier() { + return (Classifier)(new weka.classifiers.functions.Logistic()); + } + public void testRequirements(List knownList) throws AnalyzeException{ + //No requirements + return; + } + +} diff --git a/src/com/jgaap/classifiers/weightedVoting.java b/src/com/jgaap/classifiers/weightedVoting.java new file mode 100644 index 000000000..c3ee0f828 --- /dev/null +++ b/src/com/jgaap/classifiers/weightedVoting.java @@ -0,0 +1,142 @@ +package com.jgaap.classifiers; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.log4j.Logger; +import com.jgaap.backend.AnalysisDrivers; +import com.jgaap.backend.DistanceFunctions; +import com.jgaap.generics.AnalysisDriver; +import com.jgaap.generics.AnalyzeException; +import com.jgaap.generics.DistanceFunction; +import com.jgaap.generics.NeighborAnalysisDriver; +import com.jgaap.generics.ValidationDriver; 
+import com.jgaap.util.Document; +import com.jgaap.util.Pair; +import com.jgaap.util.WeightingMethod; +/** @author Alejandro J Napolitano Jawerbaum +See tooltipText for a short description. +* weightedVoting weights algorithms' votes (prediction) according to a weighting algorithm. "None" is an option. +* Using sets instead of arraylists to user-proof it against having the same algorithm vote multiple times. +*/ +public class weightedVoting extends AnalysisDriver { + public Set classifiers = new HashSet(); + private static Set> weightedClassifiers = new HashSet>(); + private static Set> weights = new HashSet>(); + private static Set authors = new HashSet(); + private static Logger logger = Logger.getLogger(weightedVoting.class); + private static List knowns = new ArrayList(); + + public weightedVoting() { + addParams("Classifiers", "Classifiers to be put to a vote.","Comma-separated list. Add | before parameters.", new String[] {""}, true); //TODO: Get all classifiers and add them to the array, then call each of them + addParams("Distances", "Distance metrics for distance dependent Analysis Drivers","Comma-separated list", new String[] {""}, true); + addParams("WeightingMethod", "Way to weight the classifiers.", "cross-validation", new String[]{"cross-validation", "accuracyOverSum", "none"}, false); + addParams("Cutoff", "Minimum cross-validation score to consider an algorithm's vote.", "75", new String[]{"0", "10", "20","30","40","45","50","55","60","65","70","75","80","85","90","95", "100"}, true); + addParams("VotingMethod", "Voting Method.", "sum", new String[] {"sum", "sum/count"}, false); + addParams("AuthorsForCrossval", "Comma separated list of Authors to cross-validate. Empty = All.", "", new String[] {}, true); + } + + @Override + public String displayName() { + return "Weighted Voting"; + } + + @Override + public String tooltipText() { + return "Takes in a list of analysis drivers, and put them to a vote on each unknown document. 
Warning: We recommend including independent classifiers only."; + } + + @Override + public boolean showInGUI() { + return true; + } + + + @Override + public void train(List knownDocuments) throws AnalyzeException { + for(Document doc : knownDocuments) + authors.add(doc.getAuthor()); + knowns = knownDocuments; + Set clsfr = new HashSet(); + for(String s : getParameter("Classifiers").split(",")) { + try { + AnalysisDriver classifier = AnalysisDrivers.getAnalysisDriver(s.trim()); + if(classifier instanceof NeighborAnalysisDriver) { + NeighborAnalysisDriver classif = (NeighborAnalysisDriver)AnalysisDrivers.getAnalysisDriver(s);; + String[] distances = getParameter("Distances").split(","); + for(String distance : distances) { + DistanceFunction dist = DistanceFunctions.getDistanceFunction(distance); + classif.setDistance(dist); + clsfr.add(classif); + } + } + else if(!(classifier instanceof LeaveOneOutNoDistanceDriver) && !(classifier instanceof ValidationDriver) && !(classifier instanceof weightedVoting)) + clsfr.add(classifier); + else + logger.info("Excluded cross-validation driver. Or worse, a weighted voting inception."); + } catch (Exception e) { + e.printStackTrace(); + } + } + classifiers = clsfr; + weights = WeightingMethod.weight(classifiers, knownDocuments, getParameter("WeightingMethod"), getParameter("AuthorsForCrossval")); + Set> weighted = new HashSet>(); + if(!getParameter("Cutoff").equals("0")) { + for(Pair weight : weights) + if(weight.getSecond()>=(Double.parseDouble(getParameter("Cutoff"))/100)) + weighted.add(weight); + weightedClassifiers = weighted; + } + } + /** + * Analyzes the unknown document and tallies the weighted votes. + * @param Document unknownDocument. Pass in the document to be analyzed. 
+ * */ + public Map vote(Document unknownDocument) throws AnalyzeException { + List> authorVote = new ArrayList>(); + + for(Pair weightedClassifier : weightedClassifiers) { + List> results = weightedClassifier.getFirst().analyze(unknownDocument); + logger.info(weightedClassifier.getFirst().displayName()+ ". weight = " + weightedClassifier.getSecond() + ". Voted for " + results.get(0).getFirst() + " for document " + unknownDocument.getTitle()); + authorVote.add(new Pair(results.get(0).getFirst(), weightedClassifier.getSecond())); + } + //We should check the results for ties, and let the score be 0 for all authors if that is the case. + Map authorVoteSumMap = new HashMap(); + for (String author : authors) { + double totalVote = 0.0; + for (Pair vote : authorVote) { + if (vote.getFirst().contains(author)) { + totalVote += vote.getSecond(); + } + } + if(!authorVoteSumMap.containsKey(author)) + authorVoteSumMap.put(author, totalVote); + + } + logger.info(authorVoteSumMap); + return authorVoteSumMap; + } + + @Override + public List> analyze(Document unknownDocument) throws AnalyzeException { + for(Pair weightedClassifier : weightedClassifiers) { + logger.info("Training " + weightedClassifier.getFirst().displayName() + " for analysis"); + weightedClassifier.getFirst().train(knowns); + logger.info("Finished training " + weightedClassifier.getFirst().displayName() + " for analysis"); + } + Map authorVoteSumMap = vote(unknownDocument); + Comparator> compareByScore = (Pair r1, Pair r2) -> r2.getSecond().compareTo(r1.getSecond()); + List> authorVoteSum = new ArrayList>(); + for(String author : authors) + authorVoteSum.add(new Pair(author,authorVoteSumMap.get(author))); + + Collections.sort(authorVoteSum, compareByScore); + //Collections.reverse(authorVoteSum); + return authorVoteSum; + } +} diff --git a/src/com/jgaap/util/WeightingMethod.java b/src/com/jgaap/util/WeightingMethod.java new file mode 100644 index 000000000..981d0c4db --- /dev/null +++ 
package com.jgaap.util;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

import com.jgaap.generics.AnalysisDriver;
import com.jgaap.generics.AnalyzeException;

/**
 * Support for weightedVoting and other algorithms that put AnalysisDrivers to a
 * vote: computes one weight per classifier.
 *
 * @author Alejandro J Napolitano Jawerbaum
 */
public class WeightingMethod {
    private static Logger logger = Logger.getLogger(WeightingMethod.class);

    /**
     * Dispatches to the weighting method named by {@code method}.
     *
     * <p>The author filter exists because, with suspected and distractor authors,
     * a user may only care how well a classifier separates the suspected authors
     * from each other and from the distractors.
     *
     * @param classifiers the drivers to weight
     * @param knownDocuments documents used for cross-validation
     * @param method "cross-validation" or "accuracyOverSum" (case-insensitive);
     *        any other value gives every classifier the weight 1.0
     * @param authors comma-separated authors to cross-validate for; an empty
     *        string means all authors
     * @return one (classifier, weight) pair per classifier
     * @throws AnalyzeException if a classifier fails while being cross-validated
     */
    public static Set<Pair<AnalysisDriver, Double>> weight(Set<AnalysisDriver> classifiers, List<Document> knownDocuments, String method, String authors) throws AnalyzeException {
        if (method.equalsIgnoreCase("cross-validation")) {
            return weightByCrossVal(classifiers, knownDocuments, authors);
        }
        if (method.equalsIgnoreCase("accuracyOverSum")) {
            return weightByAccuracyOverSum(classifiers, knownDocuments, authors);
        }
        Set<Pair<AnalysisDriver, Double>> unweightedClassifiers = new HashSet<Pair<AnalysisDriver, Double>>();
        for (AnalysisDriver classifier : classifiers) {
            unweightedClassifiers.add(new Pair<AnalysisDriver, Double>(classifier, 1.0));
        }
        return unweightedClassifiers;
    }

    /**
     * Weights every classifier by its raw leave-one-out cross-validation accuracy.
     *
     * <p>Each selected known document is held out in turn: the classifier is
     * trained on all other known documents and scores a hit when its top result
     * names the held-out document's author. A classifier for which no document
     * was selected gets the weight 0.0 rather than NaN.
     *
     * @param classifiers the drivers to weight
     * @param knownDocuments documents used for cross-validation
     * @param authors comma-separated authors to cross-validate for; an empty
     *        string means all authors
     * @return one (classifier, accuracy in [0, 1]) pair per classifier
     * @throws AnalyzeException if training or analysis fails
     */
    public static Set<Pair<AnalysisDriver, Double>> weightByCrossVal(Set<AnalysisDriver> classifiers, List<Document> knownDocuments, String authors) throws AnalyzeException {
        // Match whole author names; a substring test on the raw list would let
        // one author's name select another author that merely contains it.
        Set<String> selectedAuthors = parseAuthorFilter(authors);
        Set<Pair<AnalysisDriver, Double>> weights = new HashSet<Pair<AnalysisDriver, Double>>();

        for (AnalysisDriver classifier : classifiers) {
            int analyses = 0;
            int hits = 0;
            for (Document heldOut : knownDocuments) {
                if (!selectedAuthors.isEmpty() && !selectedAuthors.contains(heldOut.getAuthor())) {
                    continue;
                }
                List<Document> trainingSet = new ArrayList<Document>();
                for (Document candidate : knownDocuments) {
                    if (!candidate.equals(heldOut)) {
                        trainingSet.add(candidate);
                    }
                }
                logger.info("Training " + classifier.displayName() +" for cross-validation");
                classifier.train(trainingSet);
                logger.info("Finished Training "+classifier.displayName() + " for cross-validation");
                logger.info("Begining Analyzing: " + heldOut.toString() + " for cross-validation");
                List<Pair<String, Double>> results = classifier.analyze(heldOut);
                logger.info("Finished Analyzing: "+ heldOut.toString() + " for cross-validation");
                analyses++;
                // An empty result counts as a miss instead of failing on get(0).
                if (results != null && !results.isEmpty()
                        && results.get(0).getFirst().contains(heldOut.getAuthor())) {
                    hits++;
                }
            }
            double accuracy = analyses == 0 ? 0.0 : (double) hits / analyses;
            weights.add(new Pair<AnalysisDriver, Double>(classifier, accuracy));
        }
        return weights;
    }

    /**
     * Weights every classifier by its leave-one-out accuracy divided by the sum of
     * all classifiers' accuracies. When that sum is 0, every weight is 0.0 rather
     * than NaN.
     *
     * @param classifiers the drivers to weight
     * @param knownDocuments documents used for cross-validation
     * @param authors comma-separated authors to cross-validate for; an empty
     *        string means all authors
     * @return one (classifier, normalised weight) pair per classifier
     * @throws AnalyzeException if training or analysis fails
     */
    public static Set<Pair<AnalysisDriver, Double>> weightByAccuracyOverSum(Set<AnalysisDriver> classifiers, List<Document> knownDocuments, String authors) throws AnalyzeException {
        Set<Pair<AnalysisDriver, Double>> rawWeights = weightByCrossVal(classifiers, knownDocuments, authors);
        double sum = 0.0;
        for (Pair<AnalysisDriver, Double> rawWeight : rawWeights) {
            sum += rawWeight.getSecond();
        }
        Set<Pair<AnalysisDriver, Double>> normalised = new HashSet<Pair<AnalysisDriver, Double>>();
        for (Pair<AnalysisDriver, Double> rawWeight : rawWeights) {
            double share = sum == 0.0 ? 0.0 : rawWeight.getSecond() / sum;
            normalised.add(new Pair<AnalysisDriver, Double>(rawWeight.getFirst(), share));
        }
        return normalised;
    }

    /**
     * Splits a comma-separated author list into trimmed, non-empty names.
     *
     * @return the names; empty when {@code authors} is null or blank, which the
     *         callers treat as "all authors"
     */
    private static Set<String> parseAuthorFilter(String authors) {
        Set<String> names = new HashSet<String>();
        if (authors == null) {
            return names;
        }
        for (String rawName : authors.split(",")) {
            String name = rawName.trim();
            if (!name.isEmpty()) {
                names.add(name);
            }
        }
        return names;
    }

}