Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions src/com/jgaap/canonicizers/ConvertCurlyQuotes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* JGAAP -- a graphical program for stylometric authorship attribution
* Copyright (C) 2009,2011 by Patrick Juola
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.jgaap.canonicizers;

import com.jgaap.generics.Canonicizer;

/**
 * Converts typographical ("curly" or "smart") quotation marks to the standard straight
 * quotes in an attempt to standardize all quotation marks.
 *
 * <p>Only the four common curly marks are handled: the left and right double quotation
 * marks (U+201C, U+201D) become the straight double quote, and the left and right single
 * quotation marks (U+2018, U+2019) become the straight apostrophe. Every other character
 * is left untouched.
 */
public class ConvertCurlyQuotes extends Canonicizer {

    /** Left (opening) curly double quotation mark, U+201C. */
    private static final char LEFT_DOUBLE_QUOTE = '\u201c';
    /** Right (closing) curly double quotation mark, U+201D. */
    private static final char RIGHT_DOUBLE_QUOTE = '\u201d';
    /** Left (opening) curly single quotation mark, U+2018. */
    private static final char LEFT_SINGLE_QUOTE = '\u2018';
    /** Right (closing) curly single quotation mark / typographic apostrophe, U+2019. */
    private static final char RIGHT_SINGLE_QUOTE = '\u2019';

    /** Replacement for both curly double quotes: the ASCII quotation mark (U+0022). */
    private static final char STRAIGHT_DOUBLE_QUOTE = '"';
    /** Replacement for both curly single quotes: the ASCII apostrophe (U+0027). */
    private static final char STRAIGHT_SINGLE_QUOTE = '\'';

    @Override
    public String displayName() {
        return "Convert Curly Quotes";
    }

    @Override
    public String tooltipText() {
        return "Converts Curly Quotes (Smart Quotes) into nonslanted marks to standardize them.";
    }

    @Override
    public String longDescription() {
        return "Converts Curly Quotes (Smart Quotes) into nonslanted marks to standardize them. Curly Quotes typically in Word and PDF Documents.";
    }

    @Override
    public boolean showInGUI() {
        return true;
    }

    /**
     * Replaces curly single and double quotation marks with their straight equivalents.
     *
     * <p>The replacement is done in place: the array passed in is modified and the same
     * array instance is returned. Since each curly quote maps to exactly one straight
     * quote, the length of the text never changes.
     *
     * @param procText
     *            array of characters to be processed; may be {@code null}.
     * @return the same array with curly quotes replaced, or {@code null} if
     *         {@code procText} was {@code null}.
     */
    @Override
    public char[] process(char[] procText) {
        // Explicit guard instead of catching the NullPointerException: a null input is
        // handed back unchanged, exactly as before, but real bugs are no longer hidden.
        if (procText == null) {
            return procText;
        }

        for (int i = 0; i < procText.length; i++) {
            switch (procText[i]) {
                case LEFT_DOUBLE_QUOTE:
                case RIGHT_DOUBLE_QUOTE:
                    procText[i] = STRAIGHT_DOUBLE_QUOTE;
                    break;
                case LEFT_SINGLE_QUOTE:
                case RIGHT_SINGLE_QUOTE:
                    procText[i] = STRAIGHT_SINGLE_QUOTE;
                    break;
                default:
                    // Not a curly quote: leave the character as it is.
                    break;
            }
        }
        return procText;
    }
}
4 changes: 3 additions & 1 deletion src/com/jgaap/resources/abbreviation.list
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ Dr
Miss
Mr
Mrs
etc
etc


79 changes: 79 additions & 0 deletions unittests/com/jgaap/canonicizers/ConvertCurlyQuotesTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* JGAAP -- a graphical program for stylometric authorship attribution
* Copyright (C) 2009,2011 by Patrick Juola
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
*
*/

package com.jgaap.canonicizers;

import static org.junit.Assert.*;

import java.nio.charset.Charset;
import java.util.Arrays;

import org.junit.Test;

public class ConvertCurlyQuotesTest {

    /**
     * Test method for
     * {@link com.jgaap.canonicizers.ConvertCurlyQuotes#process(char[])}.
     *
     * <p>The sample text mixes curly and straight single and double quotes. It is
     * round-tripped through several character encodings that can represent curly
     * quotes, and the canonicizer output is compared against the same text written
     * with straight quotes only.
     */
    @Test
    public void testProcess() {
        String example = "Here\u2019s a sentence with curly single quotes: \u2018Hello!\u2019 and straight single quotes: 'World'. " +
                "Also, curly double quotes: \u201CThis is a test.\u201D and straight double quotes: \"Another test.\" " +
                "Let\u2019s see how they appear when printed!";
        String expectedText = "Here's a sentence with curly single quotes: 'Hello!' and straight single quotes: 'World'. " +
                "Also, curly double quotes: \"This is a test.\" and straight double quotes: \"Another test.\" " +
                "Let's see how they appear when printed!";
        char[] expected = expectedText.toCharArray();
        ConvertCurlyQuotes canon = new ConvertCurlyQuotes();

        String[] encodingsThatSupportCurlyQuotes = {
            "GB2312",
            "UTF-8",
            "UTF-16"
        };

        for (String encoding : encodingsThatSupportCurlyQuotes) {
            // Only an unavailable charset is a reason to skip. Any other failure must
            // surface as a test failure rather than being swallowed and printed.
            if (!Charset.isSupported(encoding)) {
                System.out.println("Encoding conversion did not work for " + encoding);
                continue;
            }
            Charset charset = Charset.forName(encoding);

            // Encode the sample and decode it again with the same charset.
            byte[] encodedBytes = example.getBytes(charset);
            String decodedText = new String(encodedBytes, charset);

            // Separates "the encoding lost the quotes" from "the canonicizer is wrong".
            assertEquals("Round trip through " + encoding + " altered the sample text",
                    example, decodedText);

            // process() works in place, so each encoding gets a fresh char array.
            char[] procText = decodedText.toCharArray();
            char[] result = canon.process(procText);
            assertArrayEquals("Curly quotes were not converted for encoding " + encoding,
                    expected, result);
        }
    }
}