diff --git a/src/com/jgaap/canonicizers/ConvertCurlyQuotes.java b/src/com/jgaap/canonicizers/ConvertCurlyQuotes.java
new file mode 100644
index 000000000..e1c30361e
--- /dev/null
+++ b/src/com/jgaap/canonicizers/ConvertCurlyQuotes.java
@@ -0,0 +1,94 @@
+/*
+ * JGAAP -- a graphical program for stylometric authorship attribution
+ * Copyright (C) 2009,2011 by Patrick Juola
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package com.jgaap.canonicizers;
+
+import com.jgaap.generics.Canonicizer;
+
+/**
+ * Converts typographic ("curly" or "smart") quotation marks to straight quotes in an
+ * attempt to standardize all quotation marks.
+ */
+public class ConvertCurlyQuotes extends Canonicizer {
+
+    /** Left double quotation mark (U+201C). */
+    private static final char LEFT_DOUBLE_QUOTE = '\u201c';
+    /** Right double quotation mark (U+201D). */
+    private static final char RIGHT_DOUBLE_QUOTE = '\u201d';
+    /** Left single quotation mark (U+2018). */
+    private static final char LEFT_SINGLE_QUOTE = '\u2018';
+    /** Right single quotation mark (U+2019). */
+    private static final char RIGHT_SINGLE_QUOTE = '\u2019';
+    /** Straight double quote (U+0022) that replaces both curly double quotes. */
+    private static final char STRAIGHT_DOUBLE_QUOTE = '"';
+    /** Straight single quote (U+0027) that replaces both curly single quotes. */
+    private static final char STRAIGHT_SINGLE_QUOTE = '\'';
+
+    @Override
+    public String displayName() {
+        return "Convert Curly Quotes";
+    }
+
+    @Override
+    public String tooltipText() {
+        return "Converts Curly Quotes (Smart Quotes) into nonslanted marks to standardize them.";
+    }
+
+    @Override
+    public String longDescription() {
+        return "Converts Curly Quotes (Smart Quotes) into nonslanted marks to standardize them. Curly Quotes typically in Word and PDF Documents.";
+    }
+
+    @Override
+    public boolean showInGUI() {
+        return true;
+    }
+
+    /**
+     * Replaces every curly single or double quotation mark with its straight equivalent.
+     * The array is modified in place and the same array is returned; all other characters
+     * are left untouched.
+     *
+     * @param procText
+     *            array of characters to be processed; may be {@code null}.
+     * @return {@code procText} with its curly quotes replaced, or {@code null} if
+     *         {@code procText} is {@code null}.
+     */
+    @Override
+    public char[] process(char[] procText) {
+        if (procText == null) {
+            // Nothing to convert; hand the input back unchanged instead of failing.
+            return procText;
+        }
+        for (int i = 0; i < procText.length; i++) {
+            switch (procText[i]) {
+            case LEFT_DOUBLE_QUOTE:
+            case RIGHT_DOUBLE_QUOTE:
+                procText[i] = STRAIGHT_DOUBLE_QUOTE;
+                break;
+            case LEFT_SINGLE_QUOTE:
+            case RIGHT_SINGLE_QUOTE:
+                procText[i] = STRAIGHT_SINGLE_QUOTE;
+                break;
+            default:
+                break;
+            }
+        }
+        return procText;
+    }
+}
diff --git a/src/com/jgaap/resources/abbreviation.list b/src/com/jgaap/resources/abbreviation.list
index b7470ea73..8b09a7854 100644
--- a/src/com/jgaap/resources/abbreviation.list
+++ b/src/com/jgaap/resources/abbreviation.list
@@ -2,4 +2,6 @@ Dr
 Miss
 Mr
 Mrs
-etc
\ No newline at end of file
+etc
+
+
diff --git a/unittests/com/jgaap/canonicizers/ConvertCurlyQuotesTest.java b/unittests/com/jgaap/canonicizers/ConvertCurlyQuotesTest.java
new file mode 100644
index 000000000..c28cb3caa
--- /dev/null
+++ b/unittests/com/jgaap/canonicizers/ConvertCurlyQuotesTest.java
@@ -0,0 +1,77 @@
+/*
+ * JGAAP -- a graphical program for stylometric authorship attribution
+ * Copyright (C) 2009,2011 by Patrick Juola
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package com.jgaap.canonicizers;
+
+import static org.junit.Assert.*;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link ConvertCurlyQuotes}.
+ */
+public class ConvertCurlyQuotesTest {
+
+    /**
+     * Test method for
+     * {@link com.jgaap.canonicizers.ConvertCurlyQuotes#process(char[])}.
+     * Round-trips the sample text through several encodings that can represent curly
+     * quotes and checks that every curly quote is replaced by its straight equivalent.
+     */
+    @Test
+    public void testProcess() {
+        String example = "Here\u2019s a sentence with curly single quotes: \u2018Hello!\u2019 and straight single quotes: 'World'. "
+                + "Also, curly double quotes: \u201CThis is a test.\u201D and straight double quotes: \"Another test.\" "
+                + "Let\u2019s see how they appear when printed!";
+        String expectedText = "Here's a sentence with curly single quotes: 'Hello!' and straight single quotes: 'World'. "
+                + "Also, curly double quotes: \"This is a test.\" and straight double quotes: \"Another test.\" "
+                + "Let's see how they appear when printed!";
+        char[] expected = expectedText.toCharArray();
+        ConvertCurlyQuotes canon = new ConvertCurlyQuotes();
+
+        String[] encodingsThatSupportCurlyQuotes = { "GB2312", "UTF-8", "UTF-16" };
+
+        for (String encoding : encodingsThatSupportCurlyQuotes) {
+            if (!Charset.isSupported(encoding)) {
+                // Not every JVM ships every charset; skip a missing one instead of failing.
+                System.out.println("Encoding conversion did not work for " + encoding);
+                continue;
+            }
+            Charset charset = Charset.forName(encoding);
+
+            // Encode and decode with the same charset so the canonicizer sees decoded text.
+            byte[] encodedBytes = example.getBytes(charset);
+            String decodedText = new String(encodedBytes, charset);
+
+            char[] result = canon.process(decodedText.toCharArray());
+            assertTrue("Curly quotes were not converted after a " + encoding + " round trip",
+                    Arrays.equals(expected, result));
+        }
+    }
+
+    /**
+     * A {@code null} input is handed back as {@code null} rather than raising an exception.
+     */
+    @Test
+    public void testProcessNull() {
+        assertNull(new ConvertCurlyQuotes().process((char[]) null));
+    }
+}