-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathsafetext.py
94 lines (79 loc) · 3.85 KB
/
safetext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -- coding: utf-8 --
# SafeText
# This script takes an input of a text file, and removes any characters that could be unique identifiers that would
# reveal an otherwise confidential source.
# Inspiration https://www.zachaysan.com/writing/2017-12-30-zero-width-characters
# David Jacobson
import argparse
import sys
from characters_safetext import HOMOGLYPHS, ZERO_WIDTH_CHARS, NON_STANDARD_SPACES
from unicodedata import normalize
def underline(chars):
if WIN:
return "'{}'".format(chars)
return '\033[4m' + chars + '\033[0m'
def safe_windows_print(s):
try:
print(s)
except UnicodeEncodeError as e:
for c in s:
try:
print(c, end='')
except UnicodeEncodeError as d:
print('?', end='')
COUNTRY_SMELLS = {
"BRIT_SPELLS": [],
"US_SPELLS": [],
}
for SPELL_FILE in ["BRIT_SPELLS", "US_SPELLS"]:
with open(SPELL_FILE, "r") as WORD_FILE:
lines = WORD_FILE.readlines()
for each in lines:
COUNTRY_SMELLS[SPELL_FILE].append(each.strip())
parser = argparse.ArgumentParser(description="Clean a text file of any identifying Unicode characters")
parser.add_argument('input', metavar='I', help='File to be cleaned')
args = parser.parse_args()
out_file_name = args.input + ".safe"
print("[*] Cleaning {} to {} ...".format(args.input, out_file_name))
# get platform once
WIN = sys.platform == 'win32'
with open(args.input, mode="r", encoding="UTF-8") as in_file: # File to process
lines = in_file.readlines() # Read the lines into memory
for index, line in enumerate(lines): # Use enum so we can keep track of the lines
line_to_display = line # This will be the line presented to the user, to highlight what characters were hidden
# print(normalize("NFKC", line))
for character in ZERO_WIDTH_CHARS: # Checking starts here
if ZERO_WIDTH_CHARS[character] in line:
display_line = True
print("[!] FOUND a {} ON LINE # {}".format(character, index+1)) # +1 so it's human readable
line_to_display = line_to_display.replace(ZERO_WIDTH_CHARS[character], "*")
line = line.replace(ZERO_WIDTH_CHARS[character], "") # To actually remove
for letter in HOMOGLYPHS:
if HOMOGLYPHS[letter] in line:
display_line = True
# print(underline(HOMOGLYPHS[character]))
print("[!] FOUND HOMOGLYPHIC CHARACTER {} ON LINE {}".format(letter, index+1))
line_to_display = line_to_display.replace(HOMOGLYPHS[letter], underline(HOMOGLYPHS[letter]))
replacement_char = letter[-1]
line = line.replace(HOMOGLYPHS[letter], replacement_char)
for space in NON_STANDARD_SPACES:
if NON_STANDARD_SPACES[space] in line:
print("[!] FOUND A NON STANDARD SPACE ON LINE {}".format(index+1))
line_to_display = line_to_display.replace(NON_STANDARD_SPACES[space], "*") # Highlight for display
line = line.replace(NON_STANDARD_SPACES[space], " ") # Normalize
if line_to_display != line: # If the line had to be modified, print it.
if WIN:
safe_windows_print(line_to_display.strip())
else:
print(line_to_display.strip())
for SPELLINGS in COUNTRY_SMELLS:
for word in COUNTRY_SMELLS[SPELLINGS]:
if word in line.lower(): # Normalize
print("[!] WARNING - Use of spelling ({}) that identifies country on line {}".format(word, index+1))
print("[*] GUESS OF COUNTRY: {}".format(SPELLINGS.split("_")[0]))
lines[index] = line # Update
out_file = open(out_file_name, mode="w", encoding="UTF-8")
for line in lines:
out_file.write(line)
out_file.close()
print("[*] Output file closed")