diff --git a/backend/config_files/capabilities_template b/backend/config_files/capabilities_template index cb49f94..54b43ec 100644 --- a/backend/config_files/capabilities_template +++ b/backend/config_files/capabilities_template @@ -73,5 +73,47 @@ "path": "john-local.conf", "sha1": "", "last_change": null - } + }, + + "dewiki.txt": { + "type": "dict", + "path": "dict/dewiki.txt", + "sha1": "", + "last_change": null + }, + + "frwiki.txt": { + "type": "dict", + "path": "dict/frwiki.txt", + "sha1": "", + "last_change": null + }, + + "eswiki.txt": { + "type": "dict", + "path": "dict/eswiki.txt", + "sha1": "", + "last_change": null + }, + + "titles-dewiki.txt": { + "type": "dict", + "path": "dict/titles-dewiki.txt", + "sha1": "", + "last_change": null + }, + + "titles-frwiki.txt": { + "type": "dict", + "path": "dict/titles-frwiki.txt", + "sha1": "", + "last_change": null + }, + + "titles-eswiki.txt": { + "type": "dict", + "path": "dict/titles-eswiki.txt", + "sha1": "", + "last_change": null + } } diff --git a/backend/helper_database.py b/backend/helper_database.py index 95846a1..eec0c87 100644 --- a/backend/helper_database.py +++ b/backend/helper_database.py @@ -47,7 +47,6 @@ def get_unique_id(): with open("dump_cristi.json", "w") as fd: json.dump(lst, fd, indent=4) - print("muie") sys.exit(0) # Treat simple fields @@ -58,6 +57,9 @@ def get_unique_id(): if "reserved" not in clean_entry: clean_entry["reserved"] = None + if "languages" not in clean_entry: + clean_entry["languages"] = None + if "user" in clean_entry: del clean_entry["user"] clean_entry["users"] = [entry["user"]] diff --git a/backend/rules b/backend/rules index 20f6f97..8bdff17 100644 --- a/backend/rules +++ b/backend/rules @@ -1,376 +1,891 @@ -[{ - "name": "hot_words", - "type": "scrambler", - "priority": 5, - "path": "", - "wordsize": 0, - "desc": "John the ripper Jumbo rules applied on some hot words (wifi/password/parola) and on variations of the essid", - "examples": ["password123", "parola123", "123", 
""], - "reqs": ["john", "hashcat"], - "link": "" - }, - { - "name": "top4800", - "type": "wordlist", - "priority": 10, - "path": "wordlist-top4800-probable.txt", - "wordsize": 4800, - "desc": "Top 4800 wpa2 passwords taken from SecLists", - "examples": ["password", "12345678", "abcd1234", "q1w2e3r4"], - "reqs": ["hashcat", "wordlist-top4800-probable.txt"], - "link": "/dict?dict=wordlist-top4800-probable.txt" - }, - { - "name": "dic_lc_rom", - "type": "wordlist", - "priority": 20, - "path": "dic_lc_rom.txt", - "wordsize": 72743, - "desc": "A Romanian wordlist containing 72K words", - "examples": ["pirat", "pirata", "piratare", "piraterie", "girafa", "stanga"], - "reqs": ["hashcat", "dic_lc_rom.txt"], - "link": "/dict?dict=dic_lc_rom.txt" - }, - { - "name": "nume_bac2018", - "type": "wordlist", - "priority": 22, - "path": "nume_bac2018.txt", - "wordsize": 33666, - "desc": "A list comprised of 33K names found in romania sorted by popularity", - "examples": ["maria", "andrei", "cristian", "cristi"], - "reqs": ["hashcat", "nume_bac2018.txt"], - "link": "/dict?dict=nume_bac2018.txt" - }, - { - "name": "dic_lc_eng", - "type": "wordlist", - "priority": 24, - "path": "dic_lc_eng.txt", - "wordsize": 63769, - "desc": "An English wordlist containing 63K words. 
Curated from top100K wiktionary words.", - "examples": ["something", "enough", "woman", "house", "night"], - "reqs": ["hashcat", "dic_lc_eng.txt"], - "link": "/dict?dict=dic_lc_eng.txt" - }, - { - "name": "Simple123_rom", - "type": "john", - "priority": 30, - "path": "dic_lc_rom.txt", - "rule": "Simple123", - "wordsize": 1289424, - "desc": "Number (1, 12, 21, 123, 321) appending, prepending and capitalization on Romanian wordlist", - "examples": ["bautura123", "Bautura123", "Bautura1", "123bautura", "12Bautura"], - "reqs": ["hashcat", "john", "dic_lc_rom.txt", "john-local.conf"], - "link": "" - }, - { - "name": "Simple123_bac", - "type": "john", - "priority": 32, - "path": "nume_bac2018.txt", - "rule": "Simple123", - "wordsize": 521188, - "desc": "Number (1, 12, 21, 123, 321) appending, prepending and capitalization on Romanian names", - "examples": ["gabriela123", "Gabriela123", "Gabriela1", "123gabriela", "12Gabriela"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "Simple123_eng", - "type": "john", - "priority": 34, - "path": "dic_lc_eng.txt", - "rule": "Simple123", - "wordsize": 999964, - "desc": "Number (1, 12, 21, 123, 321) appending, prepending and capitalization on English words", - "examples": ["something123", "Something123", "Something1", "123something", "12Something"], - "reqs": ["hashcat", "john", "dic_lc_eng.txt", "john-local.conf"], - "link": "" - }, - { - "name": "hibpv4_top10mil", - "type": "wordlist", - "priority": 38, - "path": "top_10mil_hibp.txt", - "wordsize": 10000000, - "desc": "Top 10 mil passwords in frequency from hibp v4", - "examples": ["love4ever", "ilikepie", "candy101", "nietzsche"], - "reqs": ["hashcat", "top_10mil_hibp.txt"], - "link": "" - }, - { - "name": "SimpleNums_rom", - "type": "john", - "priority": 40, - "path": "dic_lc_rom.txt", - "rule": "SimpleNums", - "wordsize": 57496000, - "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on Romanian 
words. Does not overlap with Simple123_rom.", - "examples": ["bautura64", "Bautura13", "Bautura18", "69bautura", "42Bautura"], - "reqs": ["hashcat", "john", "dic_lc_rom.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleNums_bac", - "type": "john", - "priority": 42, - "path": "nume_bac2018.txt", - "rule": "SimpleNums", - "wordsize": 10567244, - "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on Romanian names. Does not overlap with Simple123_bac.", - "examples": ["gabriela64", "Gabriela13", "Gabriela18", "69gabriela", "42Gabriela"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleNums_eng", - "type": "john", - "priority": 44, - "path": "dic_lc_eng.txt", - "rule": "SimpleNums", - "wordsize": 20527108, - "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on English words. Does not overlap with Simple123_eng.", - "examples": ["something64", "Something13", "Something18", "69something", "42Something"], - "reqs": ["hashcat", "john", "dic_lc_eng.txt", "john-local.conf"], - "link": "" - }, - { - "name": "close_dates_sep", - "type": "generated", - "priority": 50, - "path": "date_generator.py", - "command": " 1900 2100 -./\\:_ yes", - "wordsize": 1862328, - "desc": "All dates between years 1900-2100 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD, DDsMMsYY, MMsDDsYY, YYsMMsDD) s='-./\\:_'", - "examples": ["17-10-90", "05/31/2005", "10-12-1901", "2099:12:12"], - "reqs": ["hashcat", "date_generator.py"], - "link": "" - }, - { - "name": "Jumbo_bac", - "type": "john", - "priority": 55, - "path": "nume_comune_bac2018.txt", - "rule": "Jumbo", - "wordsize": 13795144, - "desc": "John the ripper Jumbo rules applied on more popular romanian names", - "examples": [], - "reqs": ["hashcat", "john", "nume_comune_bac2018.txt", "john-local.conf"], - "link": "/dict?dict=nume_comune_bac2018.txt" - }, - { - "name": "Jumbo_eng", - "type": "john", - "priority": 57, 
- "path": "top_2500_engl.txt", - "rule": "Jumbo", - "wordsize": 13810205, - "desc": "John the ripper Jumbo rules applied on top 2500 English words (at least 4 letters)", - "examples": [], - "reqs": ["hashcat", "john", "top_2500_engl.txt", "john-local.conf"], - "link": "/dict?dict=top_2500_engl.txt" - }, - { - "name": "RugeoSamona", - "type": "john", - "priority": 58, - "path": "nume_bac2018.txt", - "rule": "RugeoSamona", - "wordsize": 28707702, - "desc": "Custom rules that involve mostly blowjobs", - "examples": ["PuieMonta", "SugeoRamona", "MuieAndrei", "muieana1"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "historic_dates_sep", - "type": "generated", - "priority": 60, - "path": "date_generator.py", - "command": " 0 1900 -./\\:_", - "wordsize": 12300600, - "desc": "All dates between years 0-1900 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD) s='-./\\:_'", - "examples": ["17-10-1500", "05/31/1100", "10-12-0020", "1800:12:12"], - "reqs": ["hashcat", "date_generator.py"], - "link": "" - }, - { - "name": "SimpleSymbol_rom", - "type": "john", - "priority": 70, - "path": "dic_lc_rom.txt", - "rule": "SimpleSymbol", - "wordsize": 7049216, - "desc": "Symbol appending, prepending and capitalization on Romanian words", - "examples": ["bautura*", "Bautura(", "&bautura", ":Bautura"], - "reqs": ["hashcat", "john", "dic_lc_rom.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleSymbol_bac", - "type": "john", - "priority": 72, - "path": "nume_bac2018.txt", - "rule": "SimpleSymbol", - "wordsize": 2280576, - "desc": "Symbol appending, prepending and capitalization on Romanian names", - "examples": ["gabriela*", "Gabriela,", "+gabriela", "+Gabriela"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleSymbol_eng", - "type": "john", - "priority": 74, - "path": "dic_lc_eng.txt", - "rule": "SimpleSymbol", - "wordsize": 4889472, - "desc": "Symbol 
appending, prepending and capitalization on English words", - "examples": ["something*", "Something*", "Something-", ".Something"], - "reqs": ["hashcat", "john", "dic_lc_eng.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleYears_rom", - "type": "john", - "priority": 80, - "path": "dic_lc_rom.txt", - "rule": "SimpleYears", - "wordsize": 57496000, - "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) appending, prepending and capitalization on Romanian words", - "examples": ["bautura1993", "Bautura1943", "2001bautura", "2099Bautura"], - "reqs": ["hashcat", "john", "dic_lc_rom.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleYears_bac", - "type": "john", - "priority": 82, - "path": "nume_bac2018.txt", - "rule": "SimpleYears", - "wordsize": 26488000, - "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) appending, prepending and capitalization on Romanian names", - "examples": ["gabriela1947", "Gabriela1919", "Gabriela2050", "2019Gabriela"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleYears_eng", - "type": "john", - "priority": 84, - "path": "dic_lc_eng.txt", - "rule": "SimpleYears", - "wordsize": 49391200, - "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) appending, prepending and capitalization on English words", - "examples": ["something2001", "Something2006", "Something1992", "2011something"], - "reqs": ["hashcat", "john", "dic_lc_eng.txt", "john-local.conf"], - "link": "" - }, - { - "name": "future_dates_sep", - "type": "generated", - "priority": 90, - "path": "date_generator.py", - "command": " 2100 10000 -./\\:_", - "wordsize": 51144600, - "desc": "All dates between years 2100-9999 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD) s='-./\\:_'", - "examples": ["17-10-3001", "05/31/6050", "10-12-3030", "4242:12:12"], - "reqs": ["hashcat", "date_generator.py"], - "link": "" - }, - { - "name": "EightDigits", - "type": "mask_hashcat", - "priority": 100, - "path": "", - 
"mask_hashcat": "?d?d?d?d?d?d?d?d", - "wordsize": 100000000, - "desc": "All eight digits numbers", - "examples": ["17592044", "62540891", "77554310", "12385977"], - "reqs": ["hashcat"], - "link": "" - }, - { - "name": "RoPhone", - "type": "mask_hashcat", - "priority": 102, - "path": "", - "mask_hashcat": "07?d?d?d?d?d?d?d?d", - "wordsize": 100000000, - "desc": "All 10 digits phone numbers starting with '07'", - "examples": ["0745192433", "0723182955", "0744103020", "0712385977"], - "reqs": ["hashcat"], - "link": "" - }, - { - "name": "ThreeYears", - "type": "filemask_hashcat", - "priority": 108, - "path": "three_years.hcmask", - "wordsize": 8000000, - "desc": "Concatenate three times the years from 1900 to 2099 without separators", - "examples": ["197519751975", "200320061990", "209919002099", "200220022002"], - "reqs": ["hashcat", "three_years.hcmask"], - "link": "" - }, - { - "name": "hibpv4_10-100mil", - "type": "wordlist", - "priority": 110, - "path": "top_10-100mil_hibp.txt", - "wordsize": 90000000, - "desc": "Passwords from 10 to 100 mil in frequency from hibp v4", - "examples": ["horcrux9", "hehemanh", "dischead", "zvezdec123"], - "reqs": ["hashcat", "top_10-100mil_hibp.txt"], - "link": "" - }, - { - "name": "hibpv4_rest", - "type": "wordlist", - "priority": 115, - "path": "rest_hibp.txt", - "wordsize": 335309807, - "desc": "The rest of the passwords from hibp v4", - "examples": ["zucadume", "z1280136311", "shania1357", "minhuck16"], - "reqs": ["hashcat", "rest_hibp.txt"], - "link": "" - }, - { - "name": "SimpleSymbolNum_rom", - "type": "john", - "priority": 120, - "path": "dic_lc_rom.txt", - "rule": "SimpleSymbolNum", - "wordsize": 163873280, - "desc": "Symbol and number (both orders) appending, prepending and capitalization on Romanian words", - "examples": ["bautura6*", "Bautura(6", "Bautura3)", "1&bautura", ":3Bautura"], - "reqs": ["hashcat", "john", "dic_lc_rom.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleSymbolNum_bac", - "type": 
"john", - "priority": 122, - "path": "nume_bac2018.txt", - "rule": "SimpleSymbolNum", - "wordsize": 64821760, - "desc": "Symbol and number (both orders) appending, prepending and capitalization on Romanian names", - "examples": ["gabriela*1", "Gabriela/5", "+5gabriela", "9+Gabriela"], - "reqs": ["hashcat", "john", "nume_bac2018.txt", "john-local.conf"], - "link": "" - }, - { - "name": "SimpleSymbolNum_eng", - "type": "john", - "priority": 124, - "path": "dic_lc_eng.txt", - "rule": "SimpleSymbolNum", - "wordsize": 125073920, - "desc": "Symbol and number (both orders) appending, prepending and capitalization on English words", - "examples": ["something1*", "Something(8", "7*something", "<3Something"], - "reqs": ["hashcat", "john", "dic_lc_eng.txt", "john-local.conf"], - "link": "" - } +[ + { + "name": "hot_words", + "type": "scrambler", + "priority": 5, + "path": "", + "wordsize": 0, + "languages": [ + "none" + ], + "desc": "John the ripper Jumbo rules applied on some hot words (wifi/password/parola) and on variations of the essid", + "examples": [ + "password123", + "parola123", + "123", + "" + ], + "reqs": [ + "john", + "hashcat" + ], + "link": "" + }, + { + "name": "top4800", + "type": "wordlist", + "priority": 10, + "path": "wordlist-top4800-probable.txt", + "wordsize": 4800, + "languages": [ + "none" + ], + "desc": "Top 4800 wpa2 passwords taken from SecLists", + "examples": [ + "password", + "12345678", + "abcd1234", + "q1w2e3r4" + ], + "reqs": [ + "hashcat", + "wordlist-top4800-probable.txt" + ], + "link": "/dict?dict=wordlist-top4800-probable.txt" + }, + { + "name": "dic_lc_rom", + "type": "wordlist", + "priority": 20, + "path": "dic_lc_rom.txt", + "wordsize": 72743, + "languages": [ + "romanian" + ], + "desc": "A Romanian wordlist containing 72K words", + "examples": [ + "pirat", + "pirata", + "piratare", + "piraterie", + "girafa", + "stanga" + ], + "reqs": [ + "hashcat", + "dic_lc_rom.txt" + ], + "link": "/dict?dict=dic_lc_rom.txt" + }, + { + "name": 
"nume_bac2018", + "type": "wordlist", + "priority": 22, + "path": "nume_bac2018.txt", + "wordsize": 33666, + "languages": [ + "romanian" + ], + "desc": "A list comprised of 33K names found in romania sorted by popularity", + "examples": [ + "maria", + "andrei", + "cristian", + "cristi" + ], + "reqs": [ + "hashcat", + "nume_bac2018.txt" + ], + "link": "/dict?dict=nume_bac2018.txt" + }, + { + "name": "dic_lc_eng", + "type": "wordlist", + "priority": 24, + "path": "dic_lc_eng.txt", + "wordsize": 63769, + "languages": [ + "english" + ], + "desc": "An English wordlist containing 63K words. Curated from top100K wiktionary words.", + "examples": [ + "something", + "enough", + "woman", + "house", + "night" + ], + "reqs": [ + "hashcat", + "dic_lc_eng.txt" + ], + "link": "/dict?dict=dic_lc_eng.txt" + }, + { + "name": "Simple123_rom", + "type": "john", + "priority": 30, + "path": "dic_lc_rom.txt", + "rule": "Simple123", + "wordsize": 1289424, + "languages": [ + "romanian" + ], + "desc": "Number (1, 12, 21, 123, 321) appending, prepending and capitalization on Romanian wordlist", + "examples": [ + "bautura123", + "Bautura123", + "Bautura1", + "123bautura", + "12Bautura" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_rom.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "Simple123_bac", + "type": "john", + "priority": 32, + "path": "nume_bac2018.txt", + "rule": "Simple123", + "wordsize": 521188, + "languages": [ + "romanian" + ], + "desc": "Number (1, 12, 21, 123, 321) appending, prepending and capitalization on Romanian names", + "examples": [ + "gabriela123", + "Gabriela123", + "Gabriela1", + "123gabriela", + "12Gabriela" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "Simple123_eng", + "type": "john", + "priority": 34, + "path": "dic_lc_eng.txt", + "rule": "Simple123", + "wordsize": 999964, + "languages": [ + "english" + ], + "desc": "Number (1, 12, 21, 123, 321) appending, prepending and 
capitalization on English words", + "examples": [ + "something123", + "Something123", + "Something1", + "123something", + "12Something" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_eng.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "hibpv4_top10mil", + "type": "wordlist", + "priority": 38, + "path": "top_10mil_hibp.txt", + "wordsize": 10000000, + "languages": [ + "none" + ], + "desc": "Top 10 mil passwords in frequency from hibp v4", + "examples": [ + "love4ever", + "ilikepie", + "candy101", + "nietzsche" + ], + "reqs": [ + "hashcat", + "top_10mil_hibp.txt" + ], + "link": "" + }, + { + "name": "SimpleNums_rom", + "type": "john", + "priority": 40, + "path": "dic_lc_rom.txt", + "rule": "SimpleNums", + "wordsize": 57496000, + "languages": [ + "romanian" + ], + "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on Romanian words. Does not overlap with Simple123_rom.", + "examples": [ + "bautura64", + "Bautura13", + "Bautura18", + "69bautura", + "42Bautura" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_rom.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleNums_bac", + "type": "john", + "priority": 42, + "path": "nume_bac2018.txt", + "rule": "SimpleNums", + "wordsize": 10567244, + "languages": [ + "romanian" + ], + "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on Romanian names. Does not overlap with Simple123_bac.", + "examples": [ + "gabriela64", + "Gabriela13", + "Gabriela18", + "69gabriela", + "42Gabriela" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleNums_eng", + "type": "john", + "priority": 44, + "path": "dic_lc_eng.txt", + "rule": "SimpleNums", + "wordsize": 20527108, + "languages": [ + "english" + ], + "desc": "Number ([0-9], [0-9][0-9]) appending, prepending and capitalization on English words. 
Does not overlap with Simple123_eng.", + "examples": [ + "something64", + "Something13", + "Something18", + "69something", + "42Something" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_eng.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "close_dates_sep", + "type": "generated", + "priority": 50, + "path": "date_generator.py", + "command": " 1900 2100 -./\\:_ yes", + "wordsize": 1862328, + "languages": [ + "none" + ], + "desc": "All dates between years 1900-2100 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD, DDsMMsYY, MMsDDsYY, YYsMMsDD) s='-./\\:_'", + "examples": [ + "17-10-90", + "05/31/2005", + "10-12-1901", + "2099:12:12" + ], + "reqs": [ + "hashcat", + "date_generator.py" + ], + "link": "" + }, + { + "name": "Jumbo_bac", + "type": "john", + "priority": 55, + "path": "nume_comune_bac2018.txt", + "rule": "Jumbo", + "wordsize": 13795144, + "languages": [ + "romanian" + ], + "desc": "John the ripper Jumbo rules applied on more popular romanian names", + "examples": [], + "reqs": [ + "hashcat", + "john", + "nume_comune_bac2018.txt", + "john-local.conf" + ], + "link": "/dict?dict=nume_comune_bac2018.txt" + }, + { + "name": "Jumbo_eng", + "type": "john", + "priority": 57, + "path": "top_2500_engl.txt", + "rule": "Jumbo", + "wordsize": 13810205, + "languages": [ + "english" + ], + "desc": "John the ripper Jumbo rules applied on top 2500 English words (at least 4 letters)", + "examples": [], + "reqs": [ + "hashcat", + "john", + "top_2500_engl.txt", + "john-local.conf" + ], + "link": "/dict?dict=top_2500_engl.txt" + }, + { + "name": "RugeoSamona", + "type": "john", + "priority": 58, + "path": "nume_bac2018.txt", + "rule": "RugeoSamona", + "wordsize": 28707702, + "languages": [ + "romanian" + ], + "desc": "Custom rules that involve mostly blowjobs", + "examples": [ + "PuieMonta", + "SugeoRamona", + "MuieAndrei", + "muieana1" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": 
"historic_dates_sep", + "type": "generated", + "priority": 60, + "path": "date_generator.py", + "command": " 0 1900 -./\\:_", + "wordsize": 12300600, + "languages": [ + "none" + ], + "desc": "All dates between years 0-1900 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD) s='-./\\:_'", + "examples": [ + "17-10-1500", + "05/31/1100", + "10-12-0020", + "1800:12:12" + ], + "reqs": [ + "hashcat", + "date_generator.py" + ], + "link": "" + }, + { + "name": "SimpleSymbol_rom", + "type": "john", + "priority": 70, + "path": "dic_lc_rom.txt", + "rule": "SimpleSymbol", + "wordsize": 7049216, + "languages": [ + "romanian" + ], + "desc": "Symbol appending, prepending and capitalization on Romanian words", + "examples": [ + "bautura*", + "Bautura(", + "&bautura", + ":Bautura" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_rom.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleSymbol_bac", + "type": "john", + "priority": 72, + "path": "nume_bac2018.txt", + "rule": "SimpleSymbol", + "wordsize": 2280576, + "languages": [ + "romanian" + ], + "desc": "Symbol appending, prepending and capitalization on Romanian names", + "examples": [ + "gabriela*", + "Gabriela,", + "+gabriela", + "+Gabriela" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleSymbol_eng", + "type": "john", + "priority": 74, + "path": "dic_lc_eng.txt", + "rule": "SimpleSymbol", + "wordsize": 4889472, + "languages": [ + "english" + ], + "desc": "Symbol appending, prepending and capitalization on English words", + "examples": [ + "something*", + "Something*", + "Something-", + ".Something" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_eng.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleYears_rom", + "type": "john", + "priority": 80, + "path": "dic_lc_rom.txt", + "rule": "SimpleYears", + "wordsize": 57496000, + "languages": [ + "romanian" + ], + "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) 
appending, prepending and capitalization on Romanian words", + "examples": [ + "bautura1993", + "Bautura1943", + "2001bautura", + "2099Bautura" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_rom.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleYears_bac", + "type": "john", + "priority": 82, + "path": "nume_bac2018.txt", + "rule": "SimpleYears", + "wordsize": 26488000, + "languages": [ + "romanian" + ], + "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) appending, prepending and capitalization on Romanian names", + "examples": [ + "gabriela1947", + "Gabriela1919", + "Gabriela2050", + "2019Gabriela" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleYears_eng", + "type": "john", + "priority": 84, + "path": "dic_lc_eng.txt", + "rule": "SimpleYears", + "wordsize": 49391200, + "languages": [ + "english" + ], + "desc": "Recent years (19[0-9][0-9], 20[0-9][0-9]) appending, prepending and capitalization on English words", + "examples": [ + "something2001", + "Something2006", + "Something1992", + "2011something" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_eng.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "future_dates_sep", + "type": "generated", + "priority": 90, + "path": "date_generator.py", + "command": " 2100 10000 -./\\:_", + "wordsize": 51144600, + "languages": [ + "none" + ], + "desc": "All dates between years 2100-9999 using formats (DDsMMsYYYY, MMsDDsYYYY, YYYYsMMsDD) s='-./\\:_'", + "examples": [ + "17-10-3001", + "05/31/6050", + "10-12-3030", + "4242:12:12" + ], + "reqs": [ + "hashcat", + "date_generator.py" + ], + "link": "" + }, + { + "name": "EightDigits", + "type": "mask_hashcat", + "priority": 100, + "path": "", + "mask_hashcat": "?d?d?d?d?d?d?d?d", + "wordsize": 100000000, + "languages": [ + "none" + ], + "desc": "All eight digits numbers", + "examples": [ + "17592044", + "62540891", + "77554310", + "12385977" + ], + "reqs": [ + 
"hashcat" + ], + "link": "" + }, + { + "name": "RoPhone", + "type": "mask_hashcat", + "priority": 102, + "path": "", + "mask_hashcat": "07?d?d?d?d?d?d?d?d", + "wordsize": 100000000, + "languages": [ + "none" + ], + "desc": "All 10 digits phone numbers starting with '07'", + "examples": [ + "0745192433", + "0723182955", + "0744103020", + "0712385977" + ], + "reqs": [ + "hashcat" + ], + "link": "" + }, + { + "name": "ThreeYears", + "type": "filemask_hashcat", + "priority": 108, + "path": "three_years.hcmask", + "wordsize": 8000000, + "languages": [ + "none" + ], + "desc": "Concatenate three times the years from 1900 to 2099 without separators", + "examples": [ + "197519751975", + "200320061990", + "209919002099", + "200220022002" + ], + "reqs": [ + "hashcat", + "three_years.hcmask" + ], + "link": "" + }, + { + "name": "hibpv4_10-100mil", + "type": "wordlist", + "priority": 110, + "path": "top_10-100mil_hibp.txt", + "wordsize": 90000000, + "languages": [ + "none" + ], + "desc": "Passwords from 10 to 100 mil in frequency from hibp v4", + "examples": [ + "horcrux9", + "hehemanh", + "dischead", + "zvezdec123" + ], + "reqs": [ + "hashcat", + "top_10-100mil_hibp.txt" + ], + "link": "" + }, + { + "name": "hibpv4_rest", + "type": "wordlist", + "priority": 115, + "path": "rest_hibp.txt", + "wordsize": 335309807, + "languages": [ + "none" + ], + "desc": "The rest of the passwords from hibp v4", + "examples": [ + "zucadume", + "z1280136311", + "shania1357", + "minhuck16" + ], + "reqs": [ + "hashcat", + "rest_hibp.txt" + ], + "link": "" + }, + { + "name": "SimpleSymbolNum_rom", + "type": "john", + "priority": 120, + "path": "dic_lc_rom.txt", + "rule": "SimpleSymbolNum", + "wordsize": 163873280, + "languages": [ + "romanian" + ], + "desc": "Symbol and number (both orders) appending, prepending and capitalization on Romanian words", + "examples": [ + "bautura6*", + "Bautura(6", + "Bautura3)", + "1&bautura", + ":3Bautura" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_rom.txt", 
+ "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleSymbolNum_bac", + "type": "john", + "priority": 122, + "path": "nume_bac2018.txt", + "rule": "SimpleSymbolNum", + "wordsize": 64821760, + "languages": [ + "romanian" + ], + "desc": "Symbol and number (both orders) appending, prepending and capitalization on Romanian names", + "examples": [ + "gabriela*1", + "Gabriela/5", + "+5gabriela", + "9+Gabriela" + ], + "reqs": [ + "hashcat", + "john", + "nume_bac2018.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "SimpleSymbolNum_eng", + "type": "john", + "priority": 124, + "path": "dic_lc_eng.txt", + "rule": "SimpleSymbolNum", + "wordsize": 125073920, + "languages": [ + "english" + ], + "desc": "Symbol and number (both orders) appending, prepending and capitalization on English words", + "examples": [ + "something1*", + "Something(8", + "7*something", + "<3Something" + ], + "reqs": [ + "hashcat", + "john", + "dic_lc_eng.txt", + "john-local.conf" + ], + "link": "" + }, + { + "name": "WikiSpanish", + "type": "wordlist", + "priority": 104, + "path": "eswiki.txt", + "wordsize": 36419672, + "languages": [ + "spanish" + ], + "desc": "Spanish words from wikipedia dumps (english alphabet, without special characters)", + "examples": [ + "gracias", + "hablo", + "manana" + ], + "reqs": [ + "hashcat", + "eswiki.txt" + ], + "link": "/dict?dict=eswiki.txt" + }, + { + "name": "WikiFrench", + "type": "wordlist", + "priority": 105, + "path": "frwiki.txt", + "languages": [ + "french" + ], + "wordsize": 41102851, + "desc": "French words from wikipedia dumps (english alphabet, without special characters)", + "examples": [ + "bonjour", + "garcon", + "chouette" + ], + "reqs": [ + "hashcat", + "frwiki.txt" + ], + "link": "/dict?dict=frwiki.txt" + }, + { + "name": "WikiGerman", + "type": "wordlist", + "priority": 106, + "path": "dewiki.txt", + "languages": [ + "deutsch" + ], + "wordsize": 139933096, + "desc": "German words from wikipedia dumps (english alphabet, without 
special characters)", + "examples": [ + "lacheln", + "Deutscher", + "Gluck" + ], + "reqs": [ + "hashcat", + "dewiki.txt" + ], + "link": "/dict?dict=dewiki.txt" + }, + { + "name": "WikiSpanishTitles", + "type": "wordlist", + "priority": 107, + "path": "titles-eswiki.txt", + "languages": [ + "spanish" + ], + "wordsize": 36419672, + "desc": "Spanish words from wikipedia dumps (english alphabet, without special characters)", + "examples": [ + "gracias", + "hablo", + "manana" + ], + "reqs": [ + "hashcat", + "titles-eswiki.txt" + ], + "link": "/dict?dict=titles-eswiki.txt" + }, + { + "name": "WikiFrenchTitles", + "type": "wordlist", + "priority": 109, + "path": "titles-frwiki.txt", + "languages": [ + "french" + ], + "wordsize": 41102851, + "desc": "French words from wikipedia dumps (english alphabet, without special characters)", + "examples": [ + "bonjour", + "garcon", + "chouette" + ], + "reqs": [ + "hashcat", + "titles-frwiki.txt" + ], + "link": "/dict?dict=titles-frwiki.txt" + }, + { + "name": "WikiGermanTitles", + "type": "wordlist", + "priority": 200, + "path": "titles-dewiki.txt", + "wordsize": 139933096, + "languages": [ + "romanian", + "deutsch", + "spanish" + ], + "desc": "German words from wikipedia dumps (english alphabet, without special characters)", + "examples": [ + "lacheln", + "Deutscher", + "Gluck" + ], + "reqs": [ + "hashcat", + "titles-dewiki.txt" + ], + "link": "/dict?dict=titles-dewiki.txt" + } ] \ No newline at end of file diff --git a/backend/source/api.py b/backend/source/api.py index 17dbf8c..7382416 100644 --- a/backend/source/api.py +++ b/backend/source/api.py @@ -320,7 +320,6 @@ def checkfile_v1(**kwargs): @require_key def getmissing_v1(**_): client_capabilities = request.json.get("capabilities") - if client_capabilities is None: return jsonify({"success": False, "reason": "Capabilities were not sent!"}) @@ -331,6 +330,7 @@ def getmissing_v1(**_): for req in rule["reqs"]: if req in client_capabilities and req not in rule_reqs: 
rule_reqs.add(req) + if req not in Configuration.programs and \ client_capabilities[req] != Configuration.cap_dict[req]["sha1"]: diff --git a/backend/source/backend.py b/backend/source/backend.py index 4a12787..a80ea86 100755 --- a/backend/source/backend.py +++ b/backend/source/backend.py @@ -1,7 +1,13 @@ +import os import random import string +import subprocess from werkzeug.exceptions import abort +import json + +from werkzeug.utils import secure_filename + from .config import Configuration from .user import User, enc_bcrypt from .wrappers import is_admin, requires_admin, check_db_conn, ajax_requires_admin @@ -45,12 +51,35 @@ def get_uncracked_tuple(document): result["mac"] = handshake["MAC"] result["hs_type"] = handshake["handshake_type"] result["date_added"] = document["date_added"].strftime('%H:%M - %d.%m.%Y') + if handshake["active"]: result["tried_rules"] = "Trying rule %s" % document["reserved"]["tried_rule"] result["eta"] = handshake["eta"] - else: - result["tried_rules"] = "%s/%s" % (len(handshake["tried_dicts"]), Configuration.number_rules) - result["eta"] = "" + return result + + try: + with open('rules') as json_data: + rules = json.load(json_data) + except Exception as e: + Configuration.log_fatal("Error trying to load rules data: %s" % e) + + valid_rules = 0 + for rule in rules: + valid = False + for rule_language in rule["languages"]: + if rule_language == 'none': + valid_rules = valid_rules + 1 + break + for hs_language in document["languages"]: + if hs_language == rule_language: + valid_rules = valid_rules + 1 + valid = True + break + if valid: + break + + result["tried_rules"] = "%s/%s" % (len(handshake["tried_dicts"]), int(valid_rules)) + result["eta"] = "" return result @@ -93,6 +122,266 @@ def admin_panel(): abort(404) +# @blob_api.route('/delete_rule/', methods=['GET']) +# @ajax_requires_admin +# def delete_rule(name): +# rules = [] +# +# for rule in Configuration.get_active_rules(): +# if rule["name"] == name: +# 
Configuration.rule_dict.pop(name, 'No Key found') +# else: +# rules.append(rule) +# +# with open('rules', 'w') as filehandle: +# json.dump(rules, filehandle, indent=4) +# +# return render_template('rules.html', rules=rules) + + +@blob_api.route('/rules/', methods=['GET']) +@requires_admin +def handle_rules(): + return render_template('rules.html', rules=Configuration.get_active_rules()) + + +@blob_api.route('/delete_rule/', methods=['POST']) +@requires_admin +def delete_rule(): + rule_to_delete = request.form.get("rule_to_delete") + + if rule_to_delete in Configuration.rule_dict: + del Configuration.rule_dict[rule_to_delete] + else: + Configuration.logger.info("No key found") + + with open('rules', 'w') as filehandle: + json.dump(Configuration.get_active_rules(), filehandle, indent=4) + + return redirect(url_for("blob_api.handle_rules")) + + +@blob_api.route('/edit_rule/', methods=['GET', 'POST']) +@requires_admin +def edit_rule(name): + rule_to_edit = Configuration.rule_dict[name] + priorities = [] + names = [] + + for rule in Configuration.get_active_rules(): + priorities.append(rule["priority"]) + names.append(rule["name"]) + + if request.method == 'GET': + if check_db_conn() is None: + flash("DATABASE IS DOWN!") + return render_template('rules.html', rules=Configuration.get_active_rules()) + + if rule_to_edit == {}: + return render_template('rules.html', rules=Configuration.get_active_rules()) + else: + return render_template('edit_rule.html', priorities=priorities, names=names, rule=rule_to_edit) + elif request.method == 'POST': + new_rule = {"name": request.form.get("name", None), "type": request.form.get("type", None), + "priority": int(request.form.get("priority", None)), "path": request.form.get("path", None)} + + if new_rule["type"] == "wordlist" or new_rule["type"] == "filemask_hashcat": + new_rule["wordsize"] = int(request.form.get("wordsize", None)) + elif new_rule["type"] == "john": + new_rule["rule"] = request.form.get("rule", None) + 
new_rule["wordsize"] = int(request.form.get("wordsize", None)) + elif new_rule["type"] == "generated": + new_rule["command"] = request.form.get("command", None) + new_rule["wordsize"] = int(request.form.get("wordsize", None)) + elif new_rule["type"] == "mask_hashcat": + new_rule["mask_hashcat"] = request.form.get("pattern", None) + new_rule["wordsize"] = int(request.form.get("wordsize", None)) + elif new_rule["type"] == "scrambler": + new_rule["wordsize"] = 0 + new_rule["path"] = "" + + # Languages - all languages/no language selected = none + new_rule["languages"] = request.form.getlist("languages", None) + if len(new_rule["languages"]) == 0 or len(new_rule["languages"]) == Configuration.nr_languages: + new_rule["languages"] = "none" + + # Description of the rule + new_rule["desc"] = request.form.get("description", None) + + # Examples from dictionary + str_examples = request.form.get("examples", None) + examples = list(str_examples.split(",")) + examples.pop() + new_rule["examples"] = examples + + # Initialise reqs + new_rule["reqs"] = request.form.getlist("reqs", None) + + # Link + new_rule["link"] = request.form.get("link", None) + + # Save the dictionary and process it for the path and wordsize + # available for the types "wordlist", "john", "generated", "filemask_hashcat" + if 'file' in request.files: + dictionary = request.files.get("file") + filename = secure_filename(dictionary.filename) + + if filename != '': + # Save dictionary + dictionary.save(os.path.join("static/crack", filename)) + os.system("cp " + "static/crack/" + filename + " ../cracker/dict") + + # Compute wordsize and path + p1 = subprocess.Popen(["wc", "-l", "static/crack/" + filename], stdout=subprocess.PIPE) + p2 = subprocess.Popen(["awk", '{print $1}'], stdin=p1.stdout, stdout=subprocess.PIPE, universal_newlines=True) + if new_rule["type"] != "generated": + new_rule["wordsize"] = p2.stdout.read()[:-1] + new_rule["path"] = filename + else: + if new_rule["path"] == "": + flash("You forgot to 
upload the file, didn't you? Try again!") + return render_template('edit_rule.html', priorities=priorities, names=names, rule=rule_to_edit) + + # Requirements for cracking + if new_rule["type"] == "scrambler": + new_rule["reqs"] = ["john", "hashcat"] + elif new_rule["type"] == "john": + new_rule["reqs"] = ["hashcat", "john", new_rule["path"], "john-local.conf"] + elif new_rule["type"] == "mask_hashcat": + new_rule["reqs"] = ["hashcat"] + else: + new_rule["reqs"] = ["hashcat", new_rule["path"]] + + # Modify rule configuration if changes were made + rules = [] + + if new_rule["name"] not in Configuration.rule_dict: + # The name changed => add a new rule + Configuration.rule_dict[new_rule["name"]] = new_rule + rules = Configuration.get_active_rules() + else: + # Rule name is the same => edit the actual rule (if necessary) + shared_items = {k: rule_to_edit[k] for k in rule_to_edit if k in new_rule and rule_to_edit[k] == new_rule[k]} + if len(shared_items) == len(rule_to_edit.keys()): + # No changes were made + return render_template('rules.html', rules=Configuration.get_active_rules()) + else: + # Changes were made + Configuration.rule_dict[new_rule["name"]] = new_rule + for rule in Configuration.get_active_rules(): + if rule["name"] == new_rule["name"]: + rules.append(new_rule) + else: + rules.append(rule) + + # Write the changes in the rules file + with open('rules', 'w') as filehandle: + json.dump(rules, filehandle, indent=4) + + return redirect(url_for("blob_api.handle_rules")) + # return render_template('rules.html', rules=rules) + + else: + Configuration.logger.error("Unsupported method!") + abort(404) + + +@blob_api.route('/add_rule/', methods=['GET', 'POST']) +@requires_admin +def add_rule(): + if request.method == 'GET': + if check_db_conn() is None: + flash("DATABASE IS DOWN!") + return render_template('rules.html', rules=Configuration.get_active_rules()) + + priorities = [] + names = [] + for rule in Configuration.get_active_rules(): + 
priorities.append(rule["priority"]) + names.append(rule["name"]) + return render_template('add_rule.html', priorities=priorities, names=names) + + elif request.method == 'POST': + new_rule = {"name": request.form.get("name", None), "type": request.form.get("type", None), + "priority": int(request.form.get("priority", None)), "path": ""} + + if new_rule["type"] == "wordlist" or new_rule["type"] == "filemask_hashcat": + new_rule["wordsize"] = 0 + elif new_rule["type"] == "john": + new_rule["rule"] = request.form.get("rule", None) + new_rule["wordsize"] = 0 + elif new_rule["type"] == "generated": + new_rule["command"] = request.form.get("command", None) + new_rule["wordsize"] = 0 # modify + elif new_rule["type"] == "mask_hashcat": + new_rule["mask_hashcat"] = request.form.get("pattern", None) + new_rule["wordsize"] = 0 # modify + elif new_rule["type"] == "scrambler": + new_rule["wordsize"] = 0 + + # Languages - all languages/no language selected = none + new_rule["languages"] = request.form.getlist("languages", None) + if len(new_rule["languages"]) == 0 or len(new_rule["languages"]) == Configuration.nr_languages: + new_rule["languages"] = "none" + + # Description of the rule + new_rule["desc"] = request.form.get("description", None) + + # Examples from dictionary + new_rule["examples"] = request.form.getlist("examples", None) + + # Initialise reqs + new_rule["reqs"] = [] + + # Link + new_rule["link"] = request.form.get("link", None) + + # Save the dictionary and get the path + wordsize + if new_rule["type"] in ["wordlist", "john", "generated", "filemask_hashcat"]: + if 'file' not in request.files: + Configuration.logger.info("No file uploaded.") + flash("No file uploaded.") + return redirect(request.url) + + # Save dictionary + dictionary = request.files.get("file") + filename = secure_filename(dictionary.filename) + dictionary.save(os.path.join("static/crack", filename)) + os.system("cp " + "static/crack/" + filename + " ../cracker/dict") + + # Compute path and 
wordsize + p1 = subprocess.Popen(["wc", "-l", "static/crack/" + filename], stdout=subprocess.PIPE) + p2 = subprocess.Popen(["awk", '{print $1}'], stdin=p1.stdout, stdout=subprocess.PIPE, universal_newlines=True) + new_rule["path"] = filename + if new_rule["type"] != "generated": + new_rule["wordsize"] = p2.stdout.read()[:-1] + + # Requirements for cracking + if new_rule["type"] == "scrambler": + new_rule["reqs"] = ["john", "hashcat"] + elif new_rule["type"] == "john": + new_rule["reqs"] = ["hashcat", "john", new_rule["path"], "john-local.conf"] + elif new_rule["type"] == "mask_hashcat": + new_rule["reqs"] = ["hashcat"] + else: + new_rule["reqs"] = ["hashcat", new_rule["path"]] + + # Save rule in the configuration + Configuration.rule_dict[new_rule["name"]] = new_rule + rules = Configuration.get_active_rules() + rules.append(new_rule) + + # Add the new rule to rules file + with open('rules', 'w') as filehandle: + json.dump(rules, filehandle, indent=4) + + # return render_template('rules.html', rules=rules) + return redirect(url_for("blob_api.handle_rules")) + else: + Configuration.logger.error("Unsupported method!") + abort(404) + + @blob_api.route('/', methods=['GET']) def home(): if is_admin(current_user): diff --git a/backend/source/config.py b/backend/source/config.py index 8e70e52..7118195 100644 --- a/backend/source/config.py +++ b/backend/source/config.py @@ -44,7 +44,7 @@ class Configuration(object): "location": {"address": "", "city": "", "coordinates": [0.0, 0.0], - "keyword": ""}, + "keyword": []}, "path": None, "handshake": None, @@ -52,6 +52,7 @@ class Configuration(object): "users": [], "priority": 0, + "languages": [], "details": ""} default_handshake = { # Metadata @@ -72,6 +73,9 @@ class Configuration(object): # Handshake save folder save_file_location = 'handshakes' + # Number of languages + nr_languages = 5 + # Handshake verification pmkid_16800_regex = re.compile("^[0-9a-f]{32}\\*([0-9a-f]{12})\\*[0-9a-f]{12}\\*([0-9a-f]*)[\n]?$") username_regex 
= re.compile("^[a-zA-Z][-_.0-9a-zA-Z]*$") @@ -293,6 +297,7 @@ def get_mtime_for_cap_file(path): path = os.path.join(Configuration.static_folder, "crack", path.split("/")[-1]) if not os.path.exists(path): + Configuration.logger.error("Could not get mtime. File not found: %s" % path) return None, path return os.stat(path).st_mtime, path @@ -362,7 +367,7 @@ def load_data(file): Configuration.logger.info("File '%s' was updated, reloading data" % name) Configuration.set_cap_dict_data(name, data, final_dict) else: - Configuration.logger.debug("Loaded data for '%s' from generated file." % name) + Configuration.logger.info("Loaded data for '%s' from generated file." % name) final_dict[name] = deepcopy(old_cap_dict[name]) # Data for this entry is not present in generated file else: diff --git a/backend/source/scheduler.py b/backend/source/scheduler.py index 238886a..a2780fa 100644 --- a/backend/source/scheduler.py +++ b/backend/source/scheduler.py @@ -11,20 +11,41 @@ class Scheduler: - # prios = [{rule_name: rule_prio}] - priorities only for possible rules + # attributes = [{rule_name: rule_prio, rule_language}] - priorities only for possible rules # crt = [{tried_rule_name: 1}] - all tried rules for current hs mapper_template = "function() {" \ - " var prios = %s;" \ + " var attributes = %s;" \ " var result_name = '';" \ " var result_prio = 900000;" \ " var crt = {};" \ + " for (var name in attributes) { " \ + " var valid_rule = false;" \ + " for (var rule_language in attributes[name][1]) {" \ + " if (attributes[name][1][rule_language] === 'none') { " \ + " valid_rule = true;" \ + " break;" \ + " }" \ + " for (var hs_language in this['languages']) { " \ + " if (attributes[name][1][rule_language] === this['languages'][hs_language]) { " \ + " valid_rule = true;" \ + " break;" \ + " }" \ + " }" \ + " if (valid_rule === true) {" \ + " break;" \ + " }" \ + " }" \ + " if (valid_rule === false) {" \ + " delete attributes[name];" \ + " }" \ + " }" \ " for (var iter in 
this['handshake']['tried_dicts']) {" \ " crt[this['handshake']['tried_dicts'][iter]] = 1;" \ " }" \ - " for (var name in prios) {" \ - " if ( crt[name] !== 1 && prios[name] < result_prio) {" \ + " for (var name in attributes) {" \ + " if (crt[name] !== 1 && attributes[name][0] < result_prio) {" \ " result_name = name;" \ - " result_prio = prios[name];" \ + " result_prio = attributes[name][0];" \ " }" \ " }" \ " var result = {};" \ @@ -189,7 +210,7 @@ def _get_22000_data(crt_capture): def get_22000_data(crt_capture): """ This is a temporary fix needed until a better result checking - method is implemented. This should be removed as soon as possible. + method is implemented. This should be removed as soon as possible. Do not use this method. :param crt_capture: :return: @@ -197,6 +218,7 @@ def get_22000_data(crt_capture): intermediary = dict() intermediary['date_added'] = crt_capture['date_added'] intermediary['priority'] = crt_capture['priority'] + intermediary['languages'] = crt_capture['languages'] intermediary['id'] = crt_capture['id'] intermediary['path'] = crt_capture['path'] intermediary['file_type'] = crt_capture['file_type'] @@ -225,7 +247,7 @@ def get_all_possible_rules(client_capabilities): if not_good: continue - result[rule_name] = rule["priority"] + result[rule_name] = [rule["priority"], rule["languages"]] return result @staticmethod @@ -243,6 +265,7 @@ def get_next_handshake(apikey, client_capabilities): # Lock this in order to ensure that multiple threads do not reserve the same handshake with Configuration.wifis_lock: mapper = Code(Scheduler.mapper_template % Scheduler.get_all_possible_rules(client_capabilities)) + try: response = Configuration.wifis.map_reduce(mapper, Scheduler.reducerf, {"inline": 1}, query=query) except Exception as e: diff --git a/backend/source/upload.py b/backend/source/upload.py index 02b2a05..bfeb714 100644 --- a/backend/source/upload.py +++ b/backend/source/upload.py @@ -4,6 +4,7 @@ import string import random import shutil 
+import re from .process import Process from .config import Configuration from .wrappers import die, not_admin, check_db_conn @@ -14,6 +15,8 @@ from flask_login import login_required, current_user from copy import deepcopy +from geopy.geocoders import Nominatim + upload_api = Blueprint('upload_api', __name__) @@ -325,7 +328,23 @@ def upload_file(): return redirect(request.url) files = request.files.getlist('file') - Configuration.logger.info(files) + languages = request.form.getlist('language') + coordinates = request.form.getlist('coord') + suggested_passwords = request.form.getlist('tags') + + # Convert list of coordinates to a dict of coordinates + coord = [0.0, 0.0] + location = "" + address = "" + + if coordinates != ['']: + coord_lst = re.split("[, :]+", coordinates[0]) + coord_dct = {coord_lst[i]: coord_lst[i + 1] for i in range(0, len(coord_lst), 2)} + locator = Nominatim(user_agent="myGeocoder") + coord = [coord_dct["Lat"], coord_dct["Lon"]] + location = locator.reverse(coord) + address = location.raw['address'] + # Check for empty filename if len(files) == 0: Configuration.logger.info("No selected file.") @@ -346,11 +365,20 @@ def upload_file(): new_entry = deepcopy(Configuration.default_wifi) - # new_entry["location"]["keyword"] = #TODO POST keyword - # new_entry["location"]["coordinates"] = #TODO POST coordinates + if suggested_passwords != ['']: + new_entry["location"]["keyword"] = suggested_passwords + + if location != "": + new_entry["location"]["address"] = location.address + + if address != "": + new_entry["location"]["city"] = address.get('city', '') + + new_entry["location"]["coordinates"] = coord new_entry["date_added"] = datetime.datetime.now() new_entry["users"] = [current_user.get_id()] new_entry["priority"] = 0 + new_entry["languages"] = languages # Validate handshake and get file type and handshake type valid_handshake, wifi_entries = check_handshake(tmp_path, file.filename, new_entry) diff --git a/backend/static/hs_add.js 
b/backend/static/hs_add.js new file mode 100644 index 0000000..7a08376 --- /dev/null +++ b/backend/static/hs_add.js @@ -0,0 +1,90 @@ +// selectize input for the tags +$(document).ready(function() { + $('#tags').selectize({ + plugins: ['remove_button'], + delimiter: ',', + persist: true, + diacritics: false, + maxItems: 100, + create: function(input) { + return { + value: input, + text: input + } + } + }); + +require([ + "esri/Map", + "esri/views/MapView", + "esri/widgets/Search", +], function(Map, MapView, Search) { + + var map = new Map({ + basemap: "topo-vector" + }); + + var view = new MapView({ + container: "viewDiv", + map: map, + center: [-118.80543,34.02700], + zoom: 13 + }); + + //*** Add div element to show coordinates ***// + var coordsWidget = document.createElement("div"); + coordsWidget.id = "coordsWidget"; + coordsWidget.className = "esri-widget esri-component"; + coordsWidget.style.padding = "7px 15px 5px"; + view.ui.add(coordsWidget, "bottom-right"); + + //*** Update lat, lon, zoom and scale ***// + function showCoordinates(pt) { + var coords = "Lat/Lon " + pt.latitude.toFixed(3) + " " + pt.longitude.toFixed(3) + + " | Scale 1:" + Math.round(view.scale * 1) / 1 + + " | Zoom " + view.zoom; + coordsWidget.innerHTML = coords; + } + + //*** Add event and show center coordinates after the view is finished moving e.g. 
zoom, pan ***// + view.watch(["stationary"], function() { + showCoordinates(view.center); + }); + + //*** Add event to show mouse coordinates on click and move ***// + view.on(["pointer-down","pointer-move"], function(evt) { + showCoordinates(view.toMap({ x: evt.x, y: evt.y })); + }); + + //*** Add event to set coordinates on click in the textbox below the map ***// + view.on(["double-click"], function(evt) { + pt = view.toMap({ x: evt.x, y: evt.y }); + document.getElementsByName('coord')[0].value = "Lat: " + pt.latitude.toFixed(3) + ", " + "Lon: " + pt.longitude.toFixed(3); + }); + + //*** Add Search widget ***// + var search = new Search({ + view: view + }); + view.ui.add(search, "top-right"); // Add to the map + + //*** Find address ***// + view.on("click", function(evt){ + search.clear(); + view.popup.clear(); + if (search.activeSource) { + var geocoder = search.activeSource.locator; // World geocode service + var params = { + location: evt.mapPoint + }; + geocoder.locationToAddress(params) + .then(function(response) { // Show the address found + var address = response.address; + showPopup(address, evt.mapPoint); + }, function(err) { // Show no address found + showPopup("No address found.", evt.mapPoint); + }); + } + }); + }); +}); \ No newline at end of file diff --git a/backend/static/navbar.css b/backend/static/navbar.css index f78a827..1329955 100644 --- a/backend/static/navbar.css +++ b/backend/static/navbar.css @@ -24,4 +24,4 @@ ul.navbar li a:hover:not(.active) { ul.navbar li a.active { background-color: #111; -} +} \ No newline at end of file diff --git a/backend/static/rule_add.js b/backend/static/rule_add.js new file mode 100644 index 0000000..17a1f0d --- /dev/null +++ b/backend/static/rule_add.js @@ -0,0 +1,111 @@ +$(document).ready(function() { + $('#examples').selectize({ + plugins: ['remove_button'], + delimiter: ',', + persist: true, + diacritics: false, + maxItems: 100, + create: function(input) { + return { + value: input, + text: input + } + } 
+ }); + + $(function() { + $('#languages').selectize({}); + }); + + function addLabel(path) { + let element = document.createElement('label'); + let elementPath = document.createElement('div'); + elementPath.innerHTML = path; + elementPath.style.fontWeight = 'bold'; + + element.htmlFor = path.toLowerCase();; + element.appendChild(elementPath); + + return element; + } + + function addTextbox(path, input_type) { + let element = document.createElement('input'); + element.type = input_type; + element.setAttribute('id', path.toLowerCase()); + element.setAttribute('name', path.toLowerCase()); + element.setAttribute('placeholder', "Enter rule " + path.toLowerCase()); + element.required = true; + + return element; + } + + function addDictionary() { + let element = document.createElement('input'); + element.setAttribute('type', "file"); + element.setAttribute('name', "file"); + element.setAttribute('id', "file"); + + return element; + } + + function addNewline() { + var br = document.createElement("br"); + return br; + } + + $('#type').on('change', function() { + let type = document.getElementById('type').value; + let dynamicClass = document.getElementById('dynamic'); + dynamicClass.innerHTML = ''; + + if (type === 'wordlist' || type === 'filemask_hashcat') { + var dictionary = [addLabel('Dictionary'), addDictionary(), addNewline(), addNewline()]; + for (var i = 0; i < dictionary.length; i++) { + dynamicClass.appendChild(dictionary[i]); + } + } + + if (type == 'john') { + var dictionary = [addLabel('Dictionary'), addDictionary(), addNewline(), addNewline()]; + for (var i = 0; i < dictionary.length; i++) { + dynamicClass.appendChild(dictionary[i]); + } + + var rule = [addLabel('Rule'), addTextbox('Rule', 'text'), addNewline(), addNewline()]; + for (var i = 0; i < rule.length; i++) { + dynamicClass.appendChild(rule[i]); + } + } + + if (type === 'generated') { + var command = [addLabel('Command'), addTextbox('Command', 'text'), addNewline(), addNewline()]; + for (var i = 0; i 
< command.length; i++) { + dynamicClass.appendChild(command[i]); + } + + var wordsize = [addLabel('Wordsize'), addTextbox('Wordsize', 'number'), addNewline(), addNewline()]; + for (var i = 0; i < wordsize.length; i++) { + dynamicClass.appendChild(wordsize[i]); + } + + var dictionary = [addLabel('Dictionary'), addDictionary(), addNewline(), addNewline()]; + for (var i = 0; i < dictionary.length; i++) { + dynamicClass.appendChild(dictionary[i]); + } + } + + if (type === 'mask_hashcat') { + var pattern = [addLabel('Pattern'), addTextbox('Pattern', 'text'), addNewline(), addNewline()]; + for (var i = 0; i < pattern.length; i++) { + dynamicClass.appendChild(pattern[i]); + } + + var wordsize = [addLabel('Wordsize'), addTextbox('Wordsize', 'number'), addNewline(), addNewline()]; + for (var i = 0; i < wordsize.length; i++) { + dynamicClass.appendChild(wordsize[i]); + } + } + }); + +}); \ No newline at end of file diff --git a/backend/static/rule_form.css b/backend/static/rule_form.css new file mode 100644 index 0000000..7287b13 --- /dev/null +++ b/backend/static/rule_form.css @@ -0,0 +1,47 @@ +.selectize-input { + width: 250px; +} + +select { + font-size: 13px; + font-family: dejavu serif; + color: #303030; + background: white; + border: 1px solid #d0d0d0; + padding: 8px 8px; + display: inline-block; + width: 250px; + overflow: hidden; + position: relative; + z-index: 1; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + -webkit-box-shadow: inset 0 1px 1px rgba(0,0,0,.1); + box-shadow: inset 0 1px 1px rgba(0,0,0,.1); + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +::-webkit-input-placeholder { /* Edge */ + font-family: "dejavu serif"; + font-size: 13px; + color: #303030; +} + +:-ms-input-placeholder { /* Internet Explorer */ + font-family: "dejavu serif"; + font-size: 13px; + color: #303030; +} + +::placeholder { + font-family: "dejavu serif"; + font-size: 13px; + color: #303030; +} + +#hide { + 
display: none; +} \ No newline at end of file diff --git a/backend/static/selectize.js b/backend/static/selectize.js new file mode 100644 index 0000000..c7b025d --- /dev/null +++ b/backend/static/selectize.js @@ -0,0 +1,3039 @@ +/** + * sifter.js + * Copyright (c) 2013 Brian Reavis & contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at: + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF + * ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + * + * @author Brian Reavis + */ + +(function(root, factory) { + if (typeof define === 'function' && define.amd) { + define(factory); + } else if (typeof exports === 'object') { + module.exports = factory(); + } else { + root.Sifter = factory(); + } +}(this, function() { + + /** + * Textually searches arrays and hashes of objects + * by property (or multiple properties). Designed + * specifically for autocomplete. + * + * @constructor + * @param {array|object} items + * @param {object} items + */ + var Sifter = function(items, settings) { + this.items = items; + this.settings = settings || {diacritics: true}; + }; + + /** + * Splits a search string into an array of individual + * regexps to be used to match results. 
+ * + * @param {string} query + * @returns {array} + */ + Sifter.prototype.tokenize = function(query) { + query = trim(String(query || '').toLowerCase()); + if (!query || !query.length) return []; + + var i, n, regex, letter; + var tokens = []; + var words = query.split(/ +/); + + for (i = 0, n = words.length; i < n; i++) { + regex = escape_regex(words[i]); + if (this.settings.diacritics) { + for (letter in DIACRITICS) { + if (DIACRITICS.hasOwnProperty(letter)) { + regex = regex.replace(new RegExp(letter, 'g'), DIACRITICS[letter]); + } + } + } + tokens.push({ + string : words[i], + regex : new RegExp(regex, 'i') + }); + } + + return tokens; + }; + + /** + * Iterates over arrays and hashes. + * + * ``` + * this.iterator(this.items, function(item, id) { + * // invoked for each item + * }); + * ``` + * + * @param {array|object} object + */ + Sifter.prototype.iterator = function(object, callback) { + var iterator; + if (is_array(object)) { + iterator = Array.prototype.forEach || function(callback) { + for (var i = 0, n = this.length; i < n; i++) { + callback(this[i], i, this); + } + }; + } else { + iterator = function(callback) { + for (var key in this) { + if (this.hasOwnProperty(key)) { + callback(this[key], key, this); + } + } + }; + } + + iterator.apply(object, [callback]); + }; + + /** + * Returns a function to be used to score individual results. + * + * Good matches will have a higher score than poor matches. + * If an item is not a match, 0 will be returned by the function. + * + * @param {object|string} search + * @param {object} options (optional) + * @returns {function} + */ + Sifter.prototype.getScoreFunction = function(search, options) { + var self, fields, tokens, token_count; + + self = this; + search = self.prepareSearch(search, options); + tokens = search.tokens; + fields = search.options.fields; + token_count = tokens.length; + + /** + * Calculates how close of a match the + * given value is against a search token. 
+ * + * @param {mixed} value + * @param {object} token + * @return {number} + */ + var scoreValue = function(value, token) { + var score, pos; + + if (!value) return 0; + value = String(value || ''); + pos = value.search(token.regex); + if (pos === -1) return 0; + score = token.string.length / value.length; + if (pos === 0) score += 0.5; + return score; + }; + + /** + * Calculates the score of an object + * against the search query. + * + * @param {object} token + * @param {object} data + * @return {number} + */ + var scoreObject = (function() { + var field_count = fields.length; + if (!field_count) { + return function() { return 0; }; + } + if (field_count === 1) { + return function(token, data) { + return scoreValue(data[fields[0]], token); + }; + } + return function(token, data) { + for (var i = 0, sum = 0; i < field_count; i++) { + sum += scoreValue(data[fields[i]], token); + } + return sum / field_count; + }; + })(); + + if (!token_count) { + return function() { return 0; }; + } + if (token_count === 1) { + return function(data) { + return scoreObject(tokens[0], data); + }; + } + return function(data) { + for (var i = 0, sum = 0; i < token_count; i++) { + sum += scoreObject(tokens[i], data); + } + return sum / token_count; + }; + }; + + /** + * Parses a search query and returns an object + * with tokens and fields ready to be populated + * with results. + * + * @param {string} query + * @param {object} options + * @returns {object} + */ + Sifter.prototype.prepareSearch = function(query, options) { + if (typeof query === 'object') return query; + return { + options : extend({}, options), + query : String(query || '').toLowerCase(), + tokens : this.tokenize(query), + total : 0, + items : [] + }; + }; + + /** + * Searches through all items and returns a sorted array of matches. 
+ * + * The `options` parameter can contain: + * + * - fields {string|array} + * - sort {string} + * - direction {string} + * - score {function} + * - limit {integer} + * + * Returns an object containing: + * + * - options {object} + * - query {string} + * - tokens {array} + * - total {int} + * - items {array} + * + * @param {string} query + * @param {object} options + * @returns {object} + */ + Sifter.prototype.search = function(query, options) { + var self = this, value, score, search, calculateScore; + + search = this.prepareSearch(query, options); + options = search.options; + query = search.query; + + // generate result scoring function + if (!is_array(options.fields)) options.fields = [options.fields]; + calculateScore = options.score || self.getScoreFunction(search); + + // perform search and sort + if (query.length) { + self.iterator(self.items, function(item, id) { + score = calculateScore(item); + if (score > 0) { + search.items.push({'score': score, 'id': id}); + } + }); + search.items.sort(function(a, b) { + return b.score - a.score; + }); + } else { + self.iterator(self.items, function(item, id) { + search.items.push({'score': 1, 'id': id}); + }); + if (options.sort) { + search.items.sort((function() { + var field = options.sort; + var multiplier = options.direction === 'desc' ? -1 : 1; + return function(a, b) { + return cmp(self.items[a.id][field], self.items[b.id][field]) * multiplier; + }; + })()); + } + } + + // apply limits + search.total = search.items.length; + if (typeof options.limit === 'number') { + search.items = search.items.slice(0, options.limit); + } + + return search; + }; + + // utilities + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + var cmp = function(a, b) { + if (typeof a === 'number' && typeof b === 'number') { + return a > b ? 1 : (a < b ? 
-1 : 0); + } + a = String(a || '').toLowerCase(); + b = String(b || '').toLowerCase(); + if (a > b) return 1; + if (b > a) return -1; + return 0; + }; + + var extend = function(a, b) { + var i, n, k, object; + for (i = 1, n = arguments.length; i < n; i++) { + object = arguments[i]; + if (!object) continue; + for (k in object) { + if (object.hasOwnProperty(k)) { + a[k] = object[k]; + } + } + } + return a; + }; + + var trim = function(str) { + return (str + '').replace(/^\s+|\s+$|/g, ''); + }; + + var escape_regex = function(str) { + return (str + '').replace(/([.?*+^$[\]\\(){}|-])/g, '\\$1'); + }; + + var is_array = Array.isArray || ($ && $.isArray) || function(object) { + return Object.prototype.toString.call(object) === '[object Array]'; + }; + + var DIACRITICS = { + 'a': '[aÀÁÂÃÄÅàáâãäå]', + 'c': '[cÇç]', + 'e': '[eÈÉÊËèéêë]', + 'i': '[iÌÍÎÏìíîï]', + 'n': '[nÑñ]', + 'o': '[oÒÓÔÕÕÖØòóôõöø]', + 's': '[sŠš]', + 'u': '[uÙÚÛÜùúûü]', + 'y': '[yŸÿý]', + 'z': '[zŽž]' + }; + + // export + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + return Sifter; + +})); + +/** + * microplugin.js + * Copyright (c) 2013 Brian Reavis & contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at: + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF + * ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
/**
 * Loads a plugin: marks it requested, invokes its factory with the
 * user-supplied settings for that plugin (or {}), and stores the
 * factory's return value under `plugins.loaded[name]`.
 *
 * Fix: validate that the plugin exists BEFORE dereferencing
 * `Interface.plugins[name]` (previously the lookup happened first and
 * the guard only fired afterwards).
 *
 * @param {string} name
 * @throws {Error} when no plugin was registered under `name`
 */
Interface.prototype.loadPlugin = function(name) {
    var self = this;
    var plugins = self.plugins;

    if (!Interface.plugins.hasOwnProperty(name)) {
        throw new Error('Unable to find "' + name + '" plugin');
    }
    var plugin = Interface.plugins[name];

    plugins.requested[name] = true;
    plugins.loaded[name] = plugin.fn.apply(self, [self.plugins.settings[name] || {}]);
    plugins.names.push(name);
};
/**
 * Initializes a plugin on demand and returns its exports; already
 * loaded plugins are returned from cache.
 *
 * @param {string} name
 * @returns {mixed}
 * @throws {Error} when a circular plugin dependency is detected
 */
Interface.prototype.require = function(name) {
    var registry = this.plugins;

    if (!registry.loaded.hasOwnProperty(name)) {
        // requested-but-not-loaded means the plugin is mid-initialization
        if (registry.requested[name]) {
            throw new Error('Plugin has circular dependency ("' + name + '")');
        }
        this.loadPlugin(name);
    }

    return registry.loaded[name];
};
new RegExp(pattern, 'i') : pattern; + + var highlight = function(node) { + var skip = 0; + if (node.nodeType === 3) { + var pos = node.data.search(regex); + if (pos >= 0 && node.data.length > 0) { + var match = node.data.match(regex); + var spannode = document.createElement('span'); + spannode.className = 'highlight'; + var middlebit = node.splitText(pos); + var endbit = middlebit.splitText(match[0].length); + var middleclone = middlebit.cloneNode(true); + spannode.appendChild(middleclone); + middlebit.parentNode.replaceChild(spannode, middlebit); + skip = 1; + } + } else if (node.nodeType === 1 && node.childNodes && !/(script|style)/i.test(node.tagName)) { + for (var i = 0; i < node.childNodes.length; ++i) { + i += highlight(node.childNodes[i]); + } + } + return skip; + }; + + return $element.each(function() { + highlight(this); + }); + }; + + var MicroEvent = function() {}; + MicroEvent.prototype = { + on: function(event, fct){ + this._events = this._events || {}; + this._events[event] = this._events[event] || []; + this._events[event].push(fct); + }, + off: function(event, fct){ + var n = arguments.length; + if (n === 0) return delete this._events; + if (n === 1) return delete this._events[event]; + + this._events = this._events || {}; + if (event in this._events === false) return; + this._events[event].splice(this._events[event].indexOf(fct), 1); + }, + trigger: function(event /* , args... */){ + this._events = this._events || {}; + if (event in this._events === false) return; + for (var i = 0; i < this._events[event].length; i++){ + this._events[event][i].apply(this, Array.prototype.slice.call(arguments, 1)); + } + } + }; + + /** + * Mixin will delegate all MicroEvent.js function in the destination object. 
+ * + * - MicroEvent.mixin(Foobar) will make Foobar able to use MicroEvent + * + * @param {object} the object which will support MicroEvent + */ + MicroEvent.mixin = function(destObject){ + var props = ['on', 'off', 'trigger']; + for (var i = 0; i < props.length; i++){ + destObject.prototype[props[i]] = MicroEvent.prototype[props[i]]; + } + }; + + var IS_MAC = /Mac/.test(navigator.userAgent); + + var KEY_A = 65; + var KEY_COMMA = 188; + var KEY_RETURN = 13; + var KEY_ESC = 27; + var KEY_LEFT = 37; + var KEY_UP = 38; + var KEY_RIGHT = 39; + var KEY_DOWN = 40; + var KEY_BACKSPACE = 8; + var KEY_DELETE = 46; + var KEY_SHIFT = 16; + var KEY_CMD = IS_MAC ? 91 : 17; + var KEY_CTRL = IS_MAC ? 18 : 17; + var KEY_TAB = 9; + + var TAG_SELECT = 1; + var TAG_INPUT = 2; + + var isset = function(object) { + return typeof object !== 'undefined'; + }; + + /** + * Converts a scalar to its best string representation + * for hash keys and HTML attribute values. + * + * Transformations: + * 'str' -> 'str' + * null -> '' + * undefined -> '' + * true -> '1' + * false -> '0' + * 0 -> '0' + * 1 -> '1' + * + * @param {string} value + * @returns {string} + */ + var hash_key = function(value) { + if (typeof value === 'undefined' || value === null) return ''; + if (typeof value === 'boolean') return value ? '1' : '0'; + return value + ''; + }; + + /** + * Escapes a string for use within HTML. + * + * @param {string} str + * @returns {string} + */ + var escape_html = function(str) { + return (str + '') + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"'); + }; + + var hook = {}; + + /** + * Wraps `method` on `self` so that `fn` + * is invoked before the original method. 
+ * + * @param {object} self + * @param {string} method + * @param {function} fn + */ + hook.before = function(self, method, fn) { + var original = self[method]; + self[method] = function() { + fn.apply(self, arguments); + return original.apply(self, arguments); + }; + }; + + /** + * Wraps `method` on `self` so that `fn` + * is invoked after the original method. + * + * @param {object} self + * @param {string} method + * @param {function} fn + */ + hook.after = function(self, method, fn) { + var original = self[method]; + self[method] = function() { + var result = original.apply(self, arguments); + fn.apply(self, arguments); + return result; + }; + }; + + /** + * Builds a hash table out of an array of + * objects, using the specified `key` within + * each object. + * + * @param {string} key + * @param {mixed} objects + */ + var build_hash_table = function(key, objects) { + if (!$.isArray(objects)) return objects; + var i, n, table = {}; + for (i = 0, n = objects.length; i < n; i++) { + if (objects[i].hasOwnProperty(key)) { + table[objects[i][key]] = objects[i]; + } + } + return table; + }; + + /** + * Wraps `fn` so that it can only be invoked once. + * + * @param {function} fn + * @returns {function} + */ + var once = function(fn) { + var called = false; + return function() { + if (called) return; + called = true; + fn.apply(this, arguments); + }; + }; + + /** + * Wraps `fn` so that it can only be called once + * every `delay` milliseconds (invoked on the falling edge). + * + * @param {function} fn + * @param {int} delay + * @returns {function} + */ + var debounce = function(fn, delay) { + var timeout; + return function() { + var self = this; + var args = arguments; + window.clearTimeout(timeout); + timeout = window.setTimeout(function() { + fn.apply(self, args); + }, delay); + }; + }; + + /** + * Debounce all fired events types listed in `types` + * while executing the provided `fn`. 
+ * + * @param {object} self + * @param {array} types + * @param {function} fn + */ + var debounce_events = function(self, types, fn) { + var type; + var trigger = self.trigger; + var event_args = {}; + + // override trigger method + self.trigger = function() { + var type = arguments[0]; + if (types.indexOf(type) !== -1) { + event_args[type] = arguments; + } else { + return trigger.apply(self, arguments); + } + }; + + // invoke provided function + fn.apply(self, []); + self.trigger = trigger; + + // trigger queued events + for (type in event_args) { + if (event_args.hasOwnProperty(type)) { + trigger.apply(self, event_args[type]); + } + } + }; + + /** + * A workaround for http://bugs.jquery.com/ticket/6696 + * + * @param {object} $parent - Parent element to listen on. + * @param {string} event - Event name. + * @param {string} selector - Descendant selector to filter by. + * @param {function} fn - Event handler. + */ + var watchChildEvent = function($parent, event, selector, fn) { + $parent.on(event, selector, function(e) { + var child = e.target; + while (child && child.parentNode !== $parent[0]) { + child = child.parentNode; + } + e.currentTarget = child; + return fn.apply(this, [e]); + }); + }; + + /** + * Determines the current selection within a text input control. + * Returns an object containing: + * - start + * - length + * + * @param {object} input + * @returns {object} + */ + var getSelection = function(input) { + var result = {}; + if ('selectionStart' in input) { + result.start = input.selectionStart; + result.length = input.selectionEnd - result.start; + } else if (document.selection) { + input.focus(); + var sel = document.selection.createRange(); + var selLen = document.selection.createRange().text.length; + sel.moveStart('character', -input.value.length); + result.start = sel.text.length - selLen; + result.length = selLen; + } + return result; + }; + + /** + * Copies CSS properties from one element to another. 
+ * + * @param {object} $from + * @param {object} $to + * @param {array} properties + */ + var transferStyles = function($from, $to, properties) { + var i, n, styles = {}; + if (properties) { + for (i = 0, n = properties.length; i < n; i++) { + styles[properties[i]] = $from.css(properties[i]); + } + } else { + styles = $from.css(); + } + $to.css(styles); + }; + + /** + * Measures the width of a string within a + * parent element (in pixels). + * + * @param {string} str + * @param {object} $parent + * @returns {int} + */ + var measureString = function(str, $parent) { + var $test = $('').css({ + position: 'absolute', + top: -99999, + left: -99999, + width: 'auto', + padding: 0, + whiteSpace: 'nowrap' + }).text(str).appendTo('body'); + + transferStyles($parent, $test, [ + 'letterSpacing', + 'fontSize', + 'fontFamily', + 'fontWeight', + 'textTransform' + ]); + + var width = $test.width(); + $test.remove(); + + return width; + }; + + /** + * Sets up an input to grow horizontally as the user + * types. 
var autoGrow = function($input) {
    // Recomputes the input's width from its (predicted) value. Bound to
    // keydown/keyup/update/blur below, and run once immediately.
    var update = function(e) {
        var value, keyCode, printable, placeholder, width;
        var shift, character, selection;
        e = e || window.event || {};

        // ignore shortcut chords; growing can be disabled via $input.data('grow', false)
        if (e.metaKey || e.altKey) return;
        if ($input.data('grow') === false) return;

        value = $input.val();
        if (e.type && e.type.toLowerCase() === 'keydown') {
            // on keydown the input's value does not yet reflect the keystroke,
            // so predict the post-keystroke value from the key code
            keyCode = e.keyCode;
            printable = (
                (keyCode >= 97 && keyCode <= 122) || // a-z
                (keyCode >= 65 && keyCode <= 90) || // A-Z
                (keyCode >= 48 && keyCode <= 57) || // 0-9
                keyCode === 32 // space
            );

            if (keyCode === KEY_DELETE || keyCode === KEY_BACKSPACE) {
                // remove the selected range, or the character beside the caret
                selection = getSelection($input[0]);
                if (selection.length) {
                    value = value.substring(0, selection.start) + value.substring(selection.start + selection.length);
                } else if (keyCode === KEY_BACKSPACE && selection.start) {
                    // NOTE(review): this drops the character before AND at the
                    // caret (substring(selection.start + 1)) — looks off by one,
                    // but it only skews the width estimate; confirm vs upstream.
                    value = value.substring(0, selection.start - 1) + value.substring(selection.start + 1);
                } else if (keyCode === KEY_DELETE && typeof selection.start !== 'undefined') {
                    value = value.substring(0, selection.start) + value.substring(selection.start + 1);
                }
            } else if (printable) {
                // append the typed character, honoring shift for letter case
                shift = e.shiftKey;
                character = String.fromCharCode(e.keyCode);
                if (shift) character = character.toUpperCase();
                else character = character.toLowerCase();
                value += character;
            }
        }

        // an empty input is sized to fit its placeholder text instead
        placeholder = $input.attr('placeholder') || '';
        if (!value.length && placeholder.length) {
            value = placeholder;
        }

        // +4px of slack so the caret never clips; only touch the DOM on change
        width = measureString(value, $input) + 4;
        if (width !== $input.width()) {
            $input.width(width);
            $input.triggerHandler('resize');
        }
    };

    $input.on('keydown keyup update blur', update);
    update();
};
$input : $input, + tagType : $input[0].tagName.toLowerCase() === 'select' ? TAG_SELECT : TAG_INPUT, + + eventNS : '.selectize' + (++Selectize.count), + highlightedValue : null, + isOpen : false, + isDisabled : false, + isLocked : false, + isFocused : false, + isInputFocused : false, + isInputHidden : false, + isSetup : false, + isShiftDown : false, + isCmdDown : false, + isCtrlDown : false, + ignoreFocus : false, + ignoreHover : false, + hasOptions : false, + currentResults : null, + lastValue : '', + caretPos : 0, + loading : 0, + loadedSearches : {}, + + $activeOption : null, + $activeItems : [], + + optgroups : {}, + options : {}, + userOptions : {}, + items : [], + renderCache : {}, + onSearchChange : debounce(self.onSearchChange, settings.loadThrottle) + }); + + // search system + self.sifter = new Sifter(this.options, {diacritics: settings.diacritics}); + + // build options table + $.extend(self.options, build_hash_table(settings.valueField, settings.options)); + delete self.settings.options; + + // build optgroup table + $.extend(self.optgroups, build_hash_table(settings.optgroupValueField, settings.optgroups)); + delete self.settings.optgroups; + + // option-dependent defaults + self.settings.mode = self.settings.mode || (self.settings.maxItems === 1 ? 'single' : 'multi'); + if (typeof self.settings.hideSelected !== 'boolean') { + self.settings.hideSelected = self.settings.mode === 'multi'; + } + + self.initializePlugins(self.settings.plugins); + self.setupCallbacks(); + self.setupTemplates(); + self.setup(); + }; + + // mixins + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + MicroEvent.mixin(Selectize); + MicroPlugin.mixin(Selectize); + + // methods + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + $.extend(Selectize.prototype, { + + /** + * Creates all elements and sets up event bindings. 
+ */ + setup: function() { + var self = this; + var settings = self.settings; + var eventNS = self.eventNS; + var $window = $(window); + var $document = $(document); + + var $wrapper; + var $control; + var $control_input; + var $dropdown; + var $dropdown_content; + var $dropdown_parent; + var inputMode; + var timeout_blur; + var timeout_focus; + var tab_index; + var classes; + var classes_plugins; + + inputMode = self.settings.mode; + tab_index = self.$input.attr('tabindex') || ''; + classes = self.$input.attr('class') || ''; + + $wrapper = $('
').addClass(settings.wrapperClass).addClass(classes).addClass(inputMode); + $control = $('
').addClass(settings.inputClass).addClass('items').appendTo($wrapper); + $control_input = $('').appendTo($control).attr('tabindex', tab_index); + $dropdown_parent = $(settings.dropdownParent || $wrapper); + $dropdown = $('
').addClass(settings.dropdownClass).addClass(classes).addClass(inputMode).hide().appendTo($dropdown_parent); + $dropdown_content = $('
').addClass(settings.dropdownContentClass).appendTo($dropdown); + + $wrapper.css({ + width: self.$input[0].style.width + }); + + if (self.plugins.names.length) { + classes_plugins = 'plugin-' + self.plugins.names.join(' plugin-'); + $wrapper.addClass(classes_plugins); + $dropdown.addClass(classes_plugins); + } + + if ((settings.maxItems === null || settings.maxItems > 1) && self.tagType === TAG_SELECT) { + self.$input.attr('multiple', 'multiple'); + } + + if (self.settings.placeholder) { + $control_input.attr('placeholder', settings.placeholder); + } + + self.$wrapper = $wrapper; + self.$control = $control; + self.$control_input = $control_input; + self.$dropdown = $dropdown; + self.$dropdown_content = $dropdown_content; + + $control.on('mousedown', function(e) { + if (!e.isDefaultPrevented()) { + window.setTimeout(function() { + self.focus(true); + }, 0); + } + }); + + // necessary for mobile webkit devices (manual focus triggering + // is ignored unless invoked within a click event) + $control.on('click', function(e) { + if (!self.isInputFocused) { + self.focus(true); + } + }); + + $dropdown.on('mouseenter', '[data-selectable]', function() { return self.onOptionHover.apply(self, arguments); }); + $dropdown.on('mousedown', '[data-selectable]', function() { return self.onOptionSelect.apply(self, arguments); }); + watchChildEvent($control, 'mousedown', '*:not(input)', function() { return self.onItemSelect.apply(self, arguments); }); + autoGrow($control_input); + + $control_input.on({ + mousedown : function(e) { e.stopPropagation(); }, + keydown : function() { return self.onKeyDown.apply(self, arguments); }, + keyup : function() { return self.onKeyUp.apply(self, arguments); }, + keypress : function() { return self.onKeyPress.apply(self, arguments); }, + resize : function() { self.positionDropdown.apply(self, []); }, + blur : function() { return self.onBlur.apply(self, arguments); }, + focus : function() { return self.onFocus.apply(self, arguments); } + }); + + 
$document.on('keydown' + eventNS, function(e) { + self.isCmdDown = e[IS_MAC ? 'metaKey' : 'ctrlKey']; + self.isCtrlDown = e[IS_MAC ? 'altKey' : 'ctrlKey']; + self.isShiftDown = e.shiftKey; + }); + + $document.on('keyup' + eventNS, function(e) { + if (e.keyCode === KEY_CTRL) self.isCtrlDown = false; + if (e.keyCode === KEY_SHIFT) self.isShiftDown = false; + if (e.keyCode === KEY_CMD) self.isCmdDown = false; + }); + + $document.on('mousedown' + eventNS, function(e) { + if (self.isFocused) { + // prevent events on the dropdown scrollbar from causing the control to blur + if (e.target === self.$dropdown[0] || e.target.parentNode === self.$dropdown[0]) { + var ignoreFocus = self.ignoreFocus; + self.ignoreFocus = true; + window.setTimeout(function() { + self.ignoreFocus = ignoreFocus; + self.focus(false); + }, 0); + return; + } + // blur on click outside + if (!self.$control.has(e.target).length && e.target !== self.$control[0]) { + self.blur(); + } + } + }); + + $window.on(['scroll' + eventNS, 'resize' + eventNS].join(' '), function() { + if (self.isOpen) { + self.positionDropdown.apply(self, arguments); + } + }); + $window.on('mousemove' + eventNS, function() { + self.ignoreHover = false; + }); + + self.$input.attr('tabindex',-1).hide().after(self.$wrapper); + + if ($.isArray(settings.items)) { + self.setValue(settings.items); + delete settings.items; + } + + self.updateOriginalInput(); + self.refreshItems(); + self.refreshClasses(); + self.updatePlaceholder(); + self.isSetup = true; + + if (self.$input.is(':disabled')) { + self.disable(); + } + + self.on('change', this.onChange); + self.trigger('initialize'); + + // preload options + if (settings.preload) { + self.onSearchChange(''); + } + }, + + /** + * Sets up default rendering functions. + */ + setupTemplates: function() { + var self = this; + var field_label = self.settings.labelField; + var field_optgroup = self.settings.optgroupLabelField; + + var templates = { + 'optgroup': function(data) { + return '
' + data.html + '
'; + }, + 'optgroup_header': function(data, escape) { + return '
' + escape(data[field_optgroup]) + '
'; + }, + 'option': function(data, escape) { + return '
' + escape(data[field_label]) + '
'; + }, + 'item': function(data, escape) { + return '
' + escape(data[field_label]) + '
'; + }, + 'option_create': function(data, escape) { + return '
Add ' + escape(data.input) + '
'; + }, + }; + + self.settings.render = $.extend({}, templates, self.settings.render); + }, + + /** + * Maps fired events to callbacks provided + * in the settings used when creating the control. + */ + setupCallbacks: function() { + var key, fn, callbacks = { + 'initialize' : 'onInitialize', + 'change' : 'onChange', + 'item_add' : 'onItemAdd', + 'item_remove' : 'onItemRemove', + 'clear' : 'onClear', + 'option_add' : 'onOptionAdd', + 'option_remove' : 'onOptionRemove', + 'option_clear' : 'onOptionClear', + 'dropdown_open' : 'onDropdownOpen', + 'dropdown_close' : 'onDropdownClose', + 'type' : 'onType' + }; + + for (key in callbacks) { + if (callbacks.hasOwnProperty(key)) { + fn = this.settings[callbacks[key]]; + if (fn) this.on(key, fn); + } + } + }, + + /** + * Triggered when the value of the control has been changed. + * This should propagate the event to the original DOM + * input / select element. + */ + onChange: function() { + this.$input.trigger('change'); + }, + + /** + * Triggered on keypress. + * + * @param {object} e + * @returns {boolean} + */ + onKeyPress: function(e) { + if (this.isLocked) return e && e.preventDefault(); + var character = String.fromCharCode(e.keyCode || e.which); + if (this.settings.create && character === this.settings.delimiter) { + this.createItem(); + e.preventDefault(); + return false; + } + }, + + /** + * Triggered on keydown. 
+ * + * @param {object} e + * @returns {boolean} + */ + onKeyDown: function(e) { + var isInput = e.target === this.$control_input[0]; + var self = this; + + if (self.isLocked) { + if (e.keyCode !== KEY_TAB) { + e.preventDefault(); + } + return; + } + + switch (e.keyCode) { + case KEY_A: + if (self.isCmdDown) { + self.selectAll(); + return; + } + break; + case KEY_ESC: + self.blur(); + return; + case KEY_DOWN: + if (!self.isOpen && self.hasOptions) { + self.open(); + } else if (self.$activeOption) { + self.ignoreHover = true; + var $next = self.getAdjacentOption(self.$activeOption, 1); + if ($next.length) self.setActiveOption($next, true, true); + } + e.preventDefault(); + return; + case KEY_UP: + if (self.$activeOption) { + self.ignoreHover = true; + var $prev = self.getAdjacentOption(self.$activeOption, -1); + if ($prev.length) self.setActiveOption($prev, true, true); + } + e.preventDefault(); + return; + case KEY_RETURN: + if (self.$activeOption) { + self.onOptionSelect({currentTarget: self.$activeOption}); + } + e.preventDefault(); + return; + case KEY_LEFT: + self.advanceSelection(-1, e); + return; + case KEY_RIGHT: + self.advanceSelection(1, e); + return; + case KEY_TAB: + if (self.settings.create && $.trim(self.$control_input.val()).length) { + self.createItem(); + e.preventDefault(); + } + return; + case KEY_BACKSPACE: + case KEY_DELETE: + self.deleteSelection(e); + return; + } + if (self.isFull() || self.isInputHidden) { + e.preventDefault(); + return; + } + }, + + /** + * Triggered on keyup. + * + * @param {object} e + * @returns {boolean} + */ + onKeyUp: function(e) { + var self = this; + + if (self.isLocked) return e && e.preventDefault(); + var value = self.$control_input.val() || ''; + if (self.lastValue !== value) { + self.lastValue = value; + self.onSearchChange(value); + self.refreshOptions(); + self.trigger('type', value); + } + }, + + /** + * Invokes the user-provide option provider / loader. 
+ * + * Note: this function is debounced in the Selectize + * constructor (by `settings.loadDelay` milliseconds) + * + * @param {string} value + */ + onSearchChange: function(value) { + var self = this; + var fn = self.settings.load; + if (!fn) return; + if (self.loadedSearches.hasOwnProperty(value)) return; + self.loadedSearches[value] = true; + self.load(function(callback) { + fn.apply(self, [value, callback]); + }); + }, + + /** + * Triggered on focus. + * + * @param {object} e (optional) + * @returns {boolean} + */ + onFocus: function(e) { + var self = this; + + self.isInputFocused = true; + self.isFocused = true; + if (self.isDisabled) { + self.blur(); + e.preventDefault(); + return false; + } + + if (self.ignoreFocus) return; + if (self.settings.preload === 'focus') self.onSearchChange(''); + + self.showInput(); + self.setActiveItem(null); + self.refreshOptions(!!self.settings.openOnFocus); + self.refreshClasses(); + }, + + /** + * Triggered on blur. + * + * @param {object} e + * @returns {boolean} + */ + onBlur: function(e) { + var self = this; + self.isInputFocused = false; + if (self.ignoreFocus) return; + + self.close(); + self.setTextboxValue(''); + self.setActiveItem(null); + self.setActiveOption(null); + self.setCaret(self.items.length); + self.isFocused = false; + self.refreshClasses(); + }, + + /** + * Triggered when the user rolls over + * an option in the autocomplete dropdown menu. + * + * @param {object} e + * @returns {boolean} + */ + onOptionHover: function(e) { + if (this.ignoreHover) return; + this.setActiveOption(e.currentTarget, false); + }, + + /** + * Triggered when the user clicks on an option + * in the autocomplete dropdown menu. 
+ * + * @param {object} e + * @returns {boolean} + */ + onOptionSelect: function(e) { + var value, $target, $option, self = this; + + e.preventDefault && e.preventDefault(); + e.stopPropagation && e.stopPropagation(); + self.focus(false); + + $target = $(e.currentTarget); + if ($target.hasClass('create')) { + self.createItem(); + } else { + value = $target.attr('data-value'); + if (value) { + self.setTextboxValue(''); + self.addItem(value); + if (!self.settings.hideSelected && e.type && /mouse/.test(e.type)) { + self.setActiveOption(self.getOption(value)); + } + } + } + }, + + /** + * Triggered when the user clicks on an item + * that has been selected. + * + * @param {object} e + * @returns {boolean} + */ + onItemSelect: function(e) { + var self = this; + + if (self.settings.mode === 'multi') { + e.preventDefault(); + self.setActiveItem(e.currentTarget, e); + self.focus(false); + self.hideInput(); + } + }, + + /** + * Invokes the provided method that provides + * results to a callback---which are then added + * as options to the control. + * + * @param {function} fn + */ + load: function(fn) { + var self = this; + var $wrapper = self.$wrapper.addClass('loading'); + + self.loading++; + fn.apply(self, [function(results) { + self.loading = Math.max(self.loading - 1, 0); + if (results && results.length) { + self.addOption(results); + self.refreshOptions(false); + if (self.isInputFocused) self.open(); + } + if (!self.loading) { + $wrapper.removeClass('loading'); + } + self.trigger('load', results); + }]); + }, + + /** + * Sets the input field of the control to the specified value. + * + * @param {string} value + */ + setTextboxValue: function(value) { + this.$control_input.val(value).triggerHandler('update'); + this.lastValue = value; + }, + + /** + * Returns the value of the control. If multiple items + * can be selected (e.g. or + * element to reflect the current state. 
+ */ + updateOriginalInput: function() { + var i, n, options, self = this; + + if (self.$input[0].tagName.toLowerCase() === 'select') { + options = []; + for (i = 0, n = self.items.length; i < n; i++) { + options.push(''); + } + if (!options.length && !this.$input.attr('multiple')) { + options.push(''); + } + self.$input.html(options.join('')); + } else { + self.$input.val(self.getValue()); + } + + if (self.isSetup) { + self.trigger('change', self.$input.val()); + } + }, + + /** + * Shows/hide the input placeholder depending + * on if there items in the list already. + */ + updatePlaceholder: function() { + if (!this.settings.placeholder) return; + var $input = this.$control_input; + + if (this.items.length) { + $input.removeAttr('placeholder'); + } else { + $input.attr('placeholder', this.settings.placeholder); + } + $input.triggerHandler('update'); + }, + + /** + * Shows the autocomplete dropdown containing + * the available options. + */ + open: function() { + var self = this; + + if (self.isLocked || self.isOpen || (self.settings.mode === 'multi' && self.isFull())) return; + self.focus(true); + self.isOpen = true; + self.refreshClasses(); + self.$dropdown.css({visibility: 'hidden', display: 'block'}); + self.positionDropdown(); + self.$dropdown.css({visibility: 'visible'}); + self.trigger('dropdown_open', this.$dropdown); + }, + + /** + * Closes the autocomplete dropdown menu. + */ + close: function() { + var self = this; + + if (!self.isOpen) return; + self.$dropdown.hide(); + self.setActiveOption(null); + self.isOpen = false; + self.refreshClasses(); + self.trigger('dropdown_close', self.$dropdown); + }, + + /** + * Calculates and applies the appropriate + * position of the dropdown. + */ + positionDropdown: function() { + var $control = this.$control; + var offset = this.settings.dropdownParent === 'body' ? 
$control.offset() : $control.position(); + offset.top += $control.outerHeight(true); + + this.$dropdown.css({ + width : $control.outerWidth(), + top : offset.top, + left : offset.left + }); + }, + + /** + * Resets / clears all selected items + * from the control. + */ + clear: function() { + var self = this; + + if (!self.items.length) return; + self.$control.children(':not(input)').remove(); + self.items = []; + self.setCaret(0); + self.updatePlaceholder(); + self.updateOriginalInput(); + self.refreshClasses(); + self.showInput(); + self.trigger('clear'); + }, + + /** + * A helper method for inserting an element + * at the current caret position. + * + * @param {object} $el + */ + insertAtCaret: function($el) { + var caret = Math.min(this.caretPos, this.items.length); + if (caret === 0) { + this.$control.prepend($el); + } else { + $(this.$control[0].childNodes[caret]).before($el); + } + this.setCaret(caret + 1); + }, + + /** + * Removes the current selected item(s). + * + * @param {object} e (optional) + * @returns {boolean} + */ + deleteSelection: function(e) { + var i, n, direction, selection, values, caret, option_select, $option_select, $tail; + var self = this; + + direction = (e && e.keyCode === KEY_BACKSPACE) ? -1 : 1; + selection = getSelection(self.$control_input[0]); + + if (self.$activeOption && !self.settings.hideSelected) { + option_select = self.getAdjacentOption(self.$activeOption, -1).attr('data-value'); + } + + // determine items that will be removed + values = []; + + if (self.$activeItems.length) { + $tail = self.$control.children('.active:' + (direction > 0 ? 
'last' : 'first')); + caret = self.$control.children(':not(input)').index($tail); + if (direction > 0) { caret++; } + + for (i = 0, n = self.$activeItems.length; i < n; i++) { + values.push($(self.$activeItems[i]).attr('data-value')); + } + if (e) { + e.preventDefault(); + e.stopPropagation(); + } + } else if ((self.isFocused || self.settings.mode === 'single') && self.items.length) { + if (direction < 0 && selection.start === 0 && selection.length === 0) { + values.push(self.items[self.caretPos - 1]); + } else if (direction > 0 && selection.start === self.$control_input.val().length) { + values.push(self.items[self.caretPos]); + } + } + + // allow the callback to abort + if (!values.length || (typeof self.settings.onDelete === 'function' && self.settings.onDelete.apply(self, [values]) === false)) { + return false; + } + + // perform removal + if (typeof caret !== 'undefined') { + self.setCaret(caret); + } + while (values.length) { + self.removeItem(values.pop()); + } + + self.showInput(); + self.refreshOptions(true); + + // select previous option + if (option_select) { + $option_select = self.getOption(option_select); + if ($option_select.length) { + self.setActiveOption($option_select); + } + } + + return true; + }, + + /** + * Selects the previous / next item (depending + * on the `direction` argument). + * + * > 0 - right + * < 0 - left + * + * @param {int} direction + * @param {object} e (optional) + */ + advanceSelection: function(direction, e) { + var tail, selection, idx, valueLength, cursorAtEdge, $tail; + var self = this; + + if (direction === 0) return; + + tail = direction > 0 ? 'last' : 'first'; + selection = getSelection(self.$control_input[0]); + + if (self.isInputFocused && !self.isInputHidden) { + valueLength = self.$control_input.val().length; + cursorAtEdge = direction < 0 + ? 
selection.start === 0 && selection.length === 0 + : selection.start === valueLength; + + if (cursorAtEdge && !valueLength) { + self.advanceCaret(direction, e); + } + } else { + $tail = self.$control.children('.active:' + tail); + if ($tail.length) { + idx = self.$control.children(':not(input)').index($tail); + self.setActiveItem(null); + self.setCaret(direction > 0 ? idx + 1 : idx); + self.showInput(); + } + } + }, + + /** + * Moves the caret left / right. + * + * @param {int} direction + * @param {object} e (optional) + */ + advanceCaret: function(direction, e) { + if (direction === 0) return; + var self = this; + var fn = direction > 0 ? 'next' : 'prev'; + if (self.isShiftDown) { + var $adj = self.$control_input[fn](); + if ($adj.length) { + self.hideInput(); + self.setActiveItem($adj); + e && e.preventDefault(); + } + } else { + self.setCaret(self.caretPos + direction); + } + }, + + /** + * Moves the caret to the specified index. + * + * @param {int} i + */ + setCaret: function(i) { + var self = this; + + if (self.settings.mode === 'single') { + i = self.items.length; + } else { + i = Math.max(0, Math.min(self.items.length, i)); + } + + // the input must be moved by leaving it in place and moving the + // siblings, due to the fact that focus cannot be restored once lost + // on mobile webkit devices + var j, n, fn, $children, $child; + $children = self.$control.children(':not(input)'); + for (j = 0, n = $children.length; j < n; j++) { + $child = $($children[j]).detach(); + if (j < i) { + self.$control_input.before($child); + } else { + self.$control.append($child); + } + } + + self.caretPos = i; + }, + + /** + * Disables user input on the control. Used while + * items are being asynchronously created. + */ + lock: function() { + this.close(); + this.isLocked = true; + this.refreshClasses(); + }, + + /** + * Re-enables user input on the control. 
+ */ + unlock: function() { + this.isLocked = false; + this.refreshClasses(); + }, + + /** + * Disables user input on the control completely. + * While disabled, it cannot receive focus. + */ + disable: function() { + var self = this; + self.$input.prop('disabled', true); + self.isDisabled = true; + self.lock(); + }, + + /** + * Enables the control so that it can respond + * to focus and user input. + */ + enable: function() { + var self = this; + self.$input.prop('disabled', false); + self.isDisabled = false; + self.unlock(); + }, + + /** + * Completely destroys the control and + * unbinds all event listeners so that it can + * be garbage collected. + */ + destroy: function() { + var self = this; + var eventNS = self.eventNS; + + self.trigger('destroy'); + self.off(); + self.$wrapper.remove(); + self.$dropdown.remove(); + self.$input.show(); + + $(window).off(eventNS); + $(document).off(eventNS); + $(document.body).off(eventNS); + + delete self.$input[0].selectize; + }, + + /** + * A helper method for rendering "item" and + * "option" templates, given the data. 
+ * + * @param {string} templateName + * @param {object} data + * @returns {string} + */ + render: function(templateName, data) { + var value, id, label; + var html = ''; + var cache = false; + var self = this; + var regex_tag = /^[\t ]*<([a-z][a-z0-9\-_]*(?:\:[a-z][a-z0-9\-_]*)?)/i; + + if (templateName === 'option' || templateName === 'item') { + value = hash_key(data[self.settings.valueField]); + cache = !!value; + } + + // pull markup from cache if it exists + if (cache) { + if (!isset(self.renderCache[templateName])) { + self.renderCache[templateName] = {}; + } + if (self.renderCache[templateName].hasOwnProperty(value)) { + return self.renderCache[templateName][value]; + } + } + + // render markup + html = self.settings.render[templateName].apply(this, [data, escape_html]); + + // add mandatory attributes + if (templateName === 'option' || templateName === 'option_create') { + html = html.replace(regex_tag, '<$1 data-selectable'); + } + if (templateName === 'optgroup') { + id = data[self.settings.optgroupValueField] || ''; + html = html.replace(regex_tag, '<$1 data-group="' + escape_html(id) + '"'); + } + if (templateName === 'option' || templateName === 'item') { + html = html.replace(regex_tag, '<$1 data-value="' + escape_html(value || '') + '"'); + } + + // update cache + if (cache) { + self.renderCache[templateName][value] = html; + } + + return html; + } + + }); + + Selectize.count = 0; + Selectize.defaults = { + plugins: [], + delimiter: ',', + persist: true, + diacritics: true, + create: false, + highlight: true, + openOnFocus: true, + maxOptions: 1000, + maxItems: null, + hideSelected: null, + preload: false, + + scrollDuration: 60, + loadThrottle: 300, + + dataAttr: 'data-data', + optgroupField: 'optgroup', + sortField: '$order', + sortDirection: 'asc', + valueField: 'value', + labelField: 'text', + optgroupLabelField: 'label', + optgroupValueField: 'value', + optgroupOrder: null, + searchField: ['text'], + + mode: null, + wrapperClass: 
'selectize-control', + inputClass: 'selectize-input', + dropdownClass: 'selectize-dropdown', + dropdownContentClass: 'selectize-dropdown-content', + + dropdownParent: null, + + /* + load : null, // function(query, callback) { ... } + score : null, // function(search) { ... } + onInitialize : null, // function() { ... } + onChange : null, // function(value) { ... } + onItemAdd : null, // function(value, $item) { ... } + onItemRemove : null, // function(value) { ... } + onClear : null, // function() { ... } + onOptionAdd : null, // function(value, data) { ... } + onOptionRemove : null, // function(value) { ... } + onOptionClear : null, // function() { ... } + onDropdownOpen : null, // function($dropdown) { ... } + onDropdownClose : null, // function($dropdown) { ... } + onType : null, // function(str) { ... } + onDelete : null, // function(values) { ... } + */ + + render: { + /* + item: null, + optgroup: null, + optgroup_header: null, + option: null, + option_create: null + */ + } + }; + + $.fn.selectize = function(settings) { + settings = settings || {}; + + var defaults = $.fn.selectize.defaults; + var dataAttr = settings.dataAttr || defaults.dataAttr; + + /** + * Initializes selectize from a element. + * + * @param {object} $input + * @param {object} settings + */ + var init_textbox = function($input, settings_element) { + console.log('init_textbox',$input.val()) + var i, n, values, value = $.trim($input.val() || ''); + if (!value.length) return; + + values = value.split(settings.delimiter || defaults.delimiter); + for (i = 0, n = values.length; i < n; i++) { + settings_element.options[values[i]] = { + 'text' : values[i], + 'value' : values[i] + }; + } + + settings_element.items = values; + }; + + /** + * Initializes selectize from a

+ +
+

+ +
+

+ +
+

+ +
+
+ +
+
+ +
+

+ +
+ + + + + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% elif type == 'warning' %} +
+ {{ message }} +
+ {% else %} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% endif %} + {% endwith %} + + + \ No newline at end of file diff --git a/backend/templates/admin.html b/backend/templates/admin.html index af9c2f6..908d7e3 100644 --- a/backend/templates/admin.html +++ b/backend/templates/admin.html @@ -12,40 +12,42 @@ - + -
-{% with messages = get_flashed_messages(with_categories=true) %} - {% if messages %} - {% for type,message in messages %} - {% if type == 'success' %} -
- {{ message }} -
- {% else%} -
- {{ message }} -
- {% endif %} - {% endfor %} - {% else %} -
- {% endif %} -{% endwith %} +
+ {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% else%} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% else %} +
+ {% endif %} + {% endwith %} -
- -Force restart?
-
+
+ + Force restart?
+ +
- + \ No newline at end of file diff --git a/backend/templates/admin_home.html b/backend/templates/admin_home.html index 948d7f3..8dcabd0 100644 --- a/backend/templates/admin_home.html +++ b/backend/templates/admin_home.html @@ -35,123 +35,124 @@ - + -
-{% with messages = get_flashed_messages(with_categories=true) %} - {% if messages %} - {% for type,message in messages %} - {% if type == 'success' %} -
- {{ message }} -
- {% else%} -
- {{ message }} -
- {% endif %} - {% endfor %} - {% else %}
- {% endif %} -{% endwith %} + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% else%} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% else %} +
+ {% endif %} + {% endwith %} -{% if user_handshakes|length > 0 %} - {% for user_hs_pair in user_handshakes %} -
-

{{ user_hs_pair[0] }}

+ {% if user_handshakes|length > 0 %} + {% for user_hs_pair in user_handshakes %} +
+

{{ user_hs_pair[0] }}

-
-
-

- - +
+ +

+ + - - + -

- -
+ +

+ +
- {% if user_hs_pair[1]["uncracked"]|length > 0 %} - - - - - - - - - - {% for handshake in user_hs_pair[1]["uncracked"] %} - - - - - - - - - {% endfor %} -
Wifi NameWifi MACHS TypeDate AddedTried rulesEstimated time
{{ handshake["ssid"] }}{{ handshake["mac"] }}{{ handshake["hs_type"] }}{{ handshake["date_added"] }}{{ handshake["tried_rules"] }}{{ handshake["eta"] }}
- {% else %} -

No uncracked handshakes

- {% endif %} - {% if user_hs_pair[1]["cracked"]|length > 0 %} -
- - - - - - - - - - - {% for handshake in user_hs_pair[1]["cracked"] %} - - - - - - - - - - {% endfor %} -
Wifi NameWifi MACHS TypeDate AddedCracked byPasswordDate Cracked
{{ handshake["ssid"] }}{{ handshake["mac"] }}{{ handshake["hs_type"] }}{{ handshake["date_added"] }}{{ handshake["cracked_by"] }}{{ handshake["password"] }}{{ handshake["date"] }}
- {% endif %} -
-
- {% endfor %} -{% else %} -

Get some users

-{% endif %} + {% if user_hs_pair[1]["uncracked"]|length > 0 %} + + + + + + + + + + {% for handshake in user_hs_pair[1]["uncracked"] %} + + + + + + + + + {% endfor %} +
Wifi NameWifi MACHS TypeDate AddedTried rulesEstimated time
{{ handshake["ssid"] }}{{ handshake["mac"] }}{{ handshake["hs_type"] }}{{ handshake["date_added"] }}{{ handshake["tried_rules"] }}{{ handshake["eta"] }}
+ {% else %} +

No uncracked handshakes

+ {% endif %} + {% if user_hs_pair[1]["cracked"]|length > 0 %} +
+ + + + + + + + + + + {% for handshake in user_hs_pair[1]["cracked"] %} + + + + + + + + + + {% endfor %} +
Wifi NameWifi MACHS TypeDate AddedCracked byPasswordDate Cracked
{{ handshake["ssid"] }}{{ handshake["mac"] }}{{ handshake["hs_type"] }}{{ handshake["date_added"] }}{{ handshake["cracked_by"] }}{{ handshake["password"] }}{{ handshake["date"] }}
+ {% endif %} +
+
+ {% endfor %} + {% else %} +

Get some users

+ {% endif %} diff --git a/backend/templates/edit_rule.html b/backend/templates/edit_rule.html new file mode 100644 index 0000000..6db408b --- /dev/null +++ b/backend/templates/edit_rule.html @@ -0,0 +1,168 @@ + + + + + + + PSKnow + + + + + + + + + + +
+ +
+

Edit the rule

+
+ +

+ +
+

+ +
+

+ +
+

+ +
+
+ +
+
+ +
+

+ + + + + +
+

+ + +


+
+

+
+ + + + + + + + + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% else%} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% else %} +
+ {% endif %} + {% endwith %} + + + \ No newline at end of file diff --git a/backend/templates/rules.html b/backend/templates/rules.html new file mode 100644 index 0000000..4015f04 --- /dev/null +++ b/backend/templates/rules.html @@ -0,0 +1,120 @@ + + + + PSKnow + + + + + + + + + + + + + + + +
+{% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% else%} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% else %} +
+ {% endif %} +{% endwith %} + +

Rules management:

+ +


+
+ +{% if rules|length > 0 %} + + + + + + + + + {% for rule in rules %} + + + + + + + + + + + + + + + + + + + {% endfor %} +
PriorityNameInfoEditDelete
{{ rule["priority"] }}{{ rule["name"] }} +
+ + +
+
+
+ + +
+
+{% else %} +

No rules loaded at the moment

+{% endif %} + + + + \ No newline at end of file diff --git a/backend/templates/upload.html b/backend/templates/upload.html index c95b789..3bd6b0d 100644 --- a/backend/templates/upload.html +++ b/backend/templates/upload.html @@ -1,45 +1,88 @@ - PSKnow - - - - - - -
-
- -

Upload capture file(s)

-
- - -
- {% with messages = get_flashed_messages(with_categories=true) %} - {% if messages %} - {% for type,message in messages %} - {% if type == 'success' %} -
- {{ message }} -
- {% elif type == 'warning' %} -
- {{ message }} -
- {% else %} -
- {{ message }} -
- {% endif %} - {% endfor %} - {% endif %} - {% endwith %} -
- - + PSKnow + + + + + + + + + + + + + + + + +

+ + +
+

Step 1: Double-click on the area where you took the capture file(s):

+ +

+
+

+ +

Step 2: (optional) Do you have any password suggestions for the capture file(s)? Insert them here!

+ +

+ +

Step 3: Select languages for capture file(s)

+ + + + + +

+ +

Step 4: Select capture file(s)

+ +

+ +

Step 5: Upload capture file(s)

+ + + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for type,message in messages %} + {% if type == 'success' %} +
+ {{ message }} +
+ {% elif type == 'warning' %} +
+ {{ message }} +
+ {% else %} +
+ {{ message }} +
+ {% endif %} + {% endfor %} + {% endif %} + {% endwith %} +
+ + \ No newline at end of file diff --git a/helpers/WikiExtractor.py b/helpers/WikiExtractor.py new file mode 100755 index 0000000..730b3ba --- /dev/null +++ b/helpers/WikiExtractor.py @@ -0,0 +1,3296 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# ============================================================================= +# Version: 2.75 (March 4, 2017) +# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa +# +# Contributors: +# Antonio Fuschetto (fuschett@aol.com) +# Leonardo Souza (lsouza@amtera.com.br) +# Juan Manuel Caicedo (juan@cavorite.com) +# Humberto Pereira (begini@gmail.com) +# Siegfried-A. Gevatter (siegfried@gevatter.com) +# Pedro Assis (pedroh2306@gmail.com) +# Wim Muskee (wimmuskee@gmail.com) +# Radics Geza (radicsge@gmail.com) +# orangain (orangain@gmail.com) +# Seth Cleveland (scleveland@turnitin.com) +# Bren Barn +# +# ============================================================================= +# Copyright (c) 2011-2017. Giuseppe Attardi (attardi@di.unipi.it). +# ============================================================================= +# This file is part of Tanl. +# +# Tanl is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License, version 3, +# as published by the Free Software Foundation. +# +# Tanl is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License at for more details. +# +# ============================================================================= + +"""Wikipedia Extractor: +Extracts and cleans text from a Wikipedia database dump and stores output in a +number of files of similar size in a given directory. +Each file will contain several documents in the format: + + + ... 
+ + +If the program is invoked with the --json flag, then each file will +contain several documents formatted as json ojects, one per line, with +the following structure + + {"id": "", "revid": "", "url":"", "title": "", "text": "..."} + +Template expansion requires preprocesssng first the whole dump and +collecting template definitions. + +""" + +from __future__ import unicode_literals, division + +import sys +import argparse +import bz2 +import codecs +import cgi +import fileinput +import logging +import os.path +import re # TODO use regex when it will be standard +import time +import json +from io import StringIO +from multiprocessing import Queue, Process, Value, cpu_count +from timeit import default_timer + + +PY2 = sys.version_info[0] == 2 +# Python 2.7 compatibiity +if PY2: + from urllib import quote + from htmlentitydefs import name2codepoint + from itertools import izip as zip, izip_longest as zip_longest + range = xrange # Use Python 3 equivalent + chr = unichr # Use Python 3 equivalent + text_type = unicode + + class SimpleNamespace(object): + def __init__ (self, **kwargs): + self.__dict__.update(kwargs) + def __repr__ (self): + keys = sorted(self.__dict__) + items = ("{}={!r}".format(k, self.__dict__[k]) for k in keys) + return "{}({})".format(type(self).__name__, ", ".join(items)) + def __eq__ (self, other): + return self.__dict__ == other.__dict__ +else: + from urllib.parse import quote + from html.entities import name2codepoint + from itertools import zip_longest + from types import SimpleNamespace + text_type = str + + +# =========================================================================== + +# Program version +version = '2.75' + +## PARAMS #################################################################### + +options = SimpleNamespace( + + ## + # Defined in + # We include as default Template, when loading external template file. 
+ knownNamespaces = {'Template': 10}, + + ## + # The namespace used for template definitions + # It is the name associated with namespace key=10 in the siteinfo header. + templateNamespace = '', + templatePrefix = '', + + ## + # The namespace used for module definitions + # It is the name associated with namespace key=828 in the siteinfo header. + moduleNamespace = '', + + ## + # Recognize only these namespaces in links + # w: Internal links to the Wikipedia + # wiktionary: Wiki dictionary + # wikt: shortcut for Wiktionary + # + acceptedNamespaces = ['w', 'wiktionary', 'wikt'], + + # This is obtained from + urlbase = '', + + ## + # Filter disambiguation pages + filter_disambig_pages = False, + + ## + # Drop tables from the article + keep_tables = False, + + ## + # Whether to preserve links in output + keepLinks = False, + + ## + # Whether to preserve section titles + keepSections = True, + + ## + # Whether to preserve lists + keepLists = False, + + ## + # Whether to output HTML instead of text + toHTML = False, + + ## + # Whether to write json instead of the xml-like default output format + write_json = False, + + ## + # Whether to expand templates + expand_templates = True, + + ## + ## Whether to escape doc content + escape_doc = False, + + ## + # Print the wikipedia article revision + print_revision = False, + + ## + # Minimum expanded text length required to print document + min_text_length = 0, + + # Shared objects holding templates, redirects and cache + templates = {}, + redirects = {}, + # cache of parser templates + # FIXME: sharing this with a Manager slows down. 
+ templateCache = {}, + + # Elements to ignore/discard + + ignored_tag_patterns = [], + filter_category_include = set(), + filter_category_exclude = set(), + + log_file = None, + + discardElements = [ + 'gallery', 'timeline', 'noinclude', 'pre', + 'table', 'tr', 'td', 'th', 'caption', 'div', + 'form', 'input', 'select', 'option', 'textarea', + 'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir', + 'ref', 'references', 'img', 'imagemap', 'source', 'small', + 'sub', 'sup', 'indicator' + ], +) + +## +# Keys for Template and Module namespaces +templateKeys = set(['10', '828']) + +## +# Regex for identifying disambig pages +filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}|__DISAMBIG__") + +## +g_page_total = 0 +g_page_articl_total=0 +g_page_articl_used_total=0 +# page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages +def keepPage(ns, catSet, page): + global g_page_articl_total,g_page_total,g_page_articl_used_total + g_page_total += 1 + if ns != '0': # Aritcle + return False + # remove disambig pages if desired + g_page_articl_total += 1 + if options.filter_disambig_pages: + for line in page: + if filter_disambig_page_pattern.match(line): + return False + if len(options.filter_category_include) > 0 and len(options.filter_category_include & catSet)==0: + logging.debug("***No include " + str(catSet)) + return False + if len(options.filter_category_exclude) > 0 and len(options.filter_category_exclude & catSet)>0: + logging.debug("***Exclude " + str(catSet)) + return False + g_page_articl_used_total += 1 + return True + + +def get_url(uid): + return "%s?curid=%s" % (options.urlbase, uid) + + +# ========================================================================= +# +# MediaWiki Markup Grammar +# https://www.mediawiki.org/wiki/Preprocessor_ABNF + +# xml-char = %x9 / %xA / %xD / %x20-D7FF / %xE000-FFFD / %x10000-10FFFF +# sptab = SP / HTAB + +# ; everything except ">" (%x3E) +# attr-char = %x9 / %xA / %xD / 
%x20-3D / %x3F-D7FF / %xE000-FFFD / %x10000-10FFFF + +# literal = *xml-char +# title = wikitext-L3 +# part-name = wikitext-L3 +# part-value = wikitext-L3 +# part = ( part-name "=" part-value ) / ( part-value ) +# parts = [ title *( "|" part ) ] +# tplarg = "{{{" parts "}}}" +# template = "{{" parts "}}" +# link = "[[" wikitext-L3 "]]" + +# comment = "" +# unclosed-comment = "', re.DOTALL) + + +# Match ... +nowiki = re.compile(r'.*?') + + +def ignoreTag(tag): + left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL) # both and + right = re.compile(r'' % tag, re.IGNORECASE) + options.ignored_tag_patterns.append((left, right)) + +# Match selfClosing HTML tags +selfClosing_tag_patterns = [ + re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags + ] + +# Match HTML placeholder tags +placeholder_tag_patterns = [ + (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE), + repl) for tag, repl in placeholder_tags.items() + ] + +# Match preformatted lines +preformatted = re.compile(r'^ .*?$') + +# Match external links (space separates second optional parameter) +externalLink = re.compile(r'\[\w+[^ ]*? (.*?)]') +externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]') + +# Matches bold/italic +bold_italic = re.compile(r"'''''(.*?)'''''") +bold = re.compile(r"'''(.*?)'''") +italic_quote = re.compile(r"''\"([^\"]*?)\"''") +italic = re.compile(r"''(.*?)''") +quote_quote = re.compile(r'""([^"]*?)""') + +# Matches space +spaces = re.compile(r' {2,}') + +# Matches dots +dots = re.compile(r'\.{4,}') + + +# ====================================================================== + + +class Template(list): + """ + A Template is a list of TemplateText or TemplateArgs + """ + + @classmethod + def parse(cls, body): + tpl = Template() + # we must handle nesting, s.a. 
+ # {{{1|{{PAGENAME}}} + # {{{italics|{{{italic|}}} + # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}| + # + start = 0 + for s, e in findMatchingBraces(body, 3): + tpl.append(TemplateText(body[start:s])) + tpl.append(TemplateArg(body[s + 3:e - 3])) + start = e + tpl.append(TemplateText(body[start:])) # leftover + return tpl + + + def subst(self, params, extractor, depth=0): + # We perform parameter substitutions recursively. + # We also limit the maximum number of iterations to avoid too long or + # even endless loops (in case of malformed input). + + # :see: http://meta.wikimedia.org/wiki/Help:Expansion#Distinction_between_variables.2C_parser_functions.2C_and_templates + # + # Parameter values are assigned to parameters in two (?) passes. + # Therefore a parameter name in a template can depend on the value of + # another parameter of the same template, regardless of the order in + # which they are specified in the template call, for example, using + # Template:ppp containing "{{{{{{p}}}}}}", {{ppp|p=q|q=r}} and even + # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing + # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s. + + # logging.debug('&*ssubst tpl %d %s', extractor.frame.length, '', depth, self) + + if depth > extractor.maxParameterRecursionLevels: + extractor.recursion_exceeded_3_errs += 1 + return '' + + return ''.join([tpl.subst(params, extractor, depth) for tpl in self]) + + def __str__(self): + return ''.join([text_type(x) for x in self]) + + +class TemplateText(text_type): + """Fixed text of template""" + + + def subst(self, params, extractor, depth): + return self + + +class TemplateArg(object): + """ + parameter to a template. + Has a name and a default value, both of which are Templates. + """ + + def __init__(self, parameter): + """ + :param parameter: the parts of a tplarg. 
+ """ + # the parameter name itself might contain templates, e.g.: + # appointe{{#if:{{{appointer14|}}}|r|d}}14| + # 4|{{{{{subst|}}}CURRENTYEAR}} + + # any parts in a tplarg after the first (the parameter default) are + # ignored, and an equals sign in the first part is treated as plain text. + # logging.debug('TemplateArg %s', parameter) + + parts = splitParts(parameter) + self.name = Template.parse(parts[0]) + if len(parts) > 1: + # This parameter has a default value + self.default = Template.parse(parts[1]) + else: + self.default = None + + def __str__(self): + if self.default: + return '{{{%s|%s}}}' % (self.name, self.default) + else: + return '{{{%s}}}' % self.name + + + def subst(self, params, extractor, depth): + """ + Substitute value for this argument from dict :param params: + Use :param extractor: to evaluate expressions for name and default. + Limit substitution to the maximun :param depth:. + """ + # the parameter name itself might contain templates, e.g.: + # appointe{{#if:{{{appointer14|}}}|r|d}}14| + paramName = self.name.subst(params, extractor, depth + 1) + paramName = extractor.transform(paramName) + res = '' + if paramName in params: + res = params[paramName] # use parameter value specified in template invocation + elif self.default: # use the default value + defaultValue = self.default.subst(params, extractor, depth + 1) + res = extractor.transform(defaultValue) + # logging.debug('subst arg %d %s -> %s' % (depth, paramName, res)) + return res + + +class Frame(object): + + def __init__(self, title='', args=[], prev=None): + self.title = title + self.args = args + self.prev = prev + self.depth = prev.depth + 1 if prev else 0 + + + def push(self, title, args): + return Frame(title, args, self) + + + def pop(self): + return self.prev + + + def __str__(self): + res = '' + prev = self.prev + while prev: + if res: res += ', ' + res += '(%s, %s)' % (prev.title, prev.args) + prev = prev.prev + return '' + +# 
====================================================================== + +substWords = 'subst:|safesubst:' + +class Extractor(object): + """ + An extraction task on a article. + """ + def __init__(self, id, revid, title, lines): + """ + :param id: id of page. + :param title: tutle of page. + :param lines: a list of lines. + """ + self.id = id + self.revid = revid + self.title = title + self.text = ''.join(lines) + self.magicWords = MagicWords() + self.frame = Frame() + self.recursion_exceeded_1_errs = 0 # template recursion within expand() + self.recursion_exceeded_2_errs = 0 # template recursion within expandTemplate() + self.recursion_exceeded_3_errs = 0 # parameter recursion + self.template_title_errs = 0 + + def write_output(self, out, text): + """ + :param out: a memory file + :param text: the text of the page + """ + url = get_url(self.id) + if options.write_json: + json_data = { + 'id': self.id, + 'url': url, + 'title': self.title, + 'text': "\n".join(text) + } + if options.print_revision: + json_data['revid'] = self.revid + # We don't use json.dump(data, out) because we want to be + # able to encode the string if the output is sys.stdout + out_str = json.dumps(json_data, ensure_ascii=False) + if out == sys.stdout: # option -a or -o - + out_str = out_str.encode('utf-8') + out.write(out_str) + out.write('\n') + else: + if options.print_revision: + header = '\n' % (self.id, self.revid, url, self.title) + else: + header = '\n' % (self.id, url, self.title) + footer = "\n\n" + if out == sys.stdout: # option -a or -o - + header = header.encode('utf-8') + out.write(header) + for line in text: + if out == sys.stdout: # option -a or -o - + line = line.encode('utf-8') + out.write(line) + out.write('\n') + out.write(footer) + + def extract(self, out): + """ + :param out: a memory file. + """ + logging.info('%s\t%s', self.id, self.title) + + # Separate header from text with a newline. + if options.toHTML: + title_str = '

' + self.title + '

' + else: + title_str = self.title + '\n' + # https://www.mediawiki.org/wiki/Help:Magic_words + colon = self.title.find(':') + if colon != -1: + ns = self.title[:colon] + pagename = self.title[colon+1:] + else: + ns = '' # Main + pagename = self.title + self.magicWords['NAMESPACE'] = ns + self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0') + self.magicWords['PAGENAME'] = pagename + self.magicWords['FULLPAGENAME'] = self.title + slash = pagename.rfind('/') + if slash != -1: + self.magicWords['BASEPAGENAME'] = pagename[:slash] + self.magicWords['SUBPAGENAME'] = pagename[slash+1:] + else: + self.magicWords['BASEPAGENAME'] = pagename + self.magicWords['SUBPAGENAME'] = '' + slash = pagename.find('/') + if slash != -1: + self.magicWords['ROOTPAGENAME'] = pagename[:slash] + else: + self.magicWords['ROOTPAGENAME'] = pagename + self.magicWords['CURRENTYEAR'] = time.strftime('%Y') + self.magicWords['CURRENTMONTH'] = time.strftime('%m') + self.magicWords['CURRENTDAY'] = time.strftime('%d') + self.magicWords['CURRENTHOUR'] = time.strftime('%H') + self.magicWords['CURRENTTIME'] = time.strftime('%H:%M:%S') + text = self.text + self.text = '' # save memory + # + # @see https://doc.wikimedia.org/mediawiki-core/master/php/classParser.html + # This does the equivalent of internalParse(): + # + # $dom = $this->preprocessToDom( $text, $flag ); + # $text = $frame->expand( $dom ); + # + text = self.transform(text) + text = self.wiki2text(text) + text = compact(self.clean(text)) + # from zwChan + text = [title_str] + text + + if sum(len(line) for line in text) < options.min_text_length: + return + + self.write_output(out, text) + + errs = (self.template_title_errs, + self.recursion_exceeded_1_errs, + self.recursion_exceeded_2_errs, + self.recursion_exceeded_3_errs) + if any(errs): + logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", + self.title, self.id, *errs) + + + def transform(self, wikitext): + """ + Transforms wiki markup. 
+ @see https://www.mediawiki.org/wiki/Help:Formatting + """ + # look for matching ... + res = '' + cur = 0 + for m in nowiki.finditer(wikitext, cur): + res += self.transform1(wikitext[cur:m.start()]) + wikitext[m.start():m.end()] + cur = m.end() + # leftover + res += self.transform1(wikitext[cur:]) + return res + + + def transform1(self, text): + """Transform text not containing """ + if options.expand_templates: + # expand templates + # See: http://www.mediawiki.org/wiki/Help:Templates + return self.expand(text) + else: + # Drop transclusions (template, parser functions) + return dropNested(text, r'{{', r'}}') + + + def wiki2text(self, text): + # + # final part of internalParse().) + # + # $text = $this->doTableStuff( $text ); + # $text = preg_replace( '/(^|\n)-----*/', '\\1
', $text ); + # $text = $this->doDoubleUnderscore( $text ); + # $text = $this->doHeadings( $text ); + # $text = $this->replaceInternalLinks( $text ); + # $text = $this->doAllQuotes( $text ); + # $text = $this->replaceExternalLinks( $text ); + # $text = str_replace( self::MARKER_PREFIX . 'NOPARSE', '', $text ); + # $text = $this->doMagicLinks( $text ); + # $text = $this->formatHeadings( $text, $origText, $isMain ); + + # Drop tables + # first drop residual templates, or else empty parameter |} might look like end of table. + if not options.keep_tables: + text = dropNested(text, r'{{', r'}}') + text = dropNested(text, r'{\|', r'\|}') + + # Handle bold/italic/quote + if options.toHTML: + text = bold_italic.sub(r'\1', text) + text = bold.sub(r'\1', text) + text = italic.sub(r'\1', text) + else: + text = bold_italic.sub(r'\1', text) + text = bold.sub(r'\1', text) + text = italic_quote.sub(r'"\1"', text) + text = italic.sub(r'"\1"', text) + text = quote_quote.sub(r'"\1"', text) + # residuals of unbalanced quotes + text = text.replace("'''", '').replace("''", '"') + + # replace internal links + text = replaceInternalLinks(text) + + # replace external links + text = replaceExternalLinks(text) + + # drop MagicWords behavioral switches + text = magicWordsRE.sub('', text) + + # ############### Process HTML ############### + + # turn into HTML, except for the content of + res = '' + cur = 0 + for m in syntaxhighlight.finditer(text): + res += unescape(text[cur:m.start()]) + m.group(1) + cur = m.end() + text = res + unescape(text[cur:]) + return text + + + def clean(self, text): + """ + Removes irrelevant parts from :param: text. 
+ """ + + # Collect spans + spans = [] + # Drop HTML comments + for m in comment.finditer(text): + spans.append((m.start(), m.end())) + + # Drop self-closing tags + for pattern in selfClosing_tag_patterns: + for m in pattern.finditer(text): + spans.append((m.start(), m.end())) + + # Drop ignored tags + for left, right in options.ignored_tag_patterns: + for m in left.finditer(text): + spans.append((m.start(), m.end())) + for m in right.finditer(text): + spans.append((m.start(), m.end())) + + # Bulk remove all spans + text = dropSpans(spans, text) + + # Drop discarded elements + for tag in options.discardElements: + text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag) + + if not options.toHTML: + # Turn into text what is left (&nbsp;) and + text = unescape(text) + + # Expand placeholders + for pattern, placeholder in placeholder_tag_patterns: + index = 1 + for match in pattern.finditer(text): + text = text.replace(match.group(), '%s_%d' % (placeholder, index)) + index += 1 + + text = text.replace('<<', '«').replace('>>', '»') + + ############################################# + + # Cleanup text + text = text.replace('\t', ' ') + text = spaces.sub(' ', text) + text = dots.sub('...', text) + text = re.sub(' (,:\.\)\]»)', r'\1', text) + text = re.sub('(\[\(«) ', r'\1', text) + text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations + text = text.replace(',,', ',').replace(',.', '.') + if options.keep_tables: + # the following regular expressions are used to remove the wikiml chartacters around table strucutures + # yet keep the content. The order here is imporant so we remove certain markup like {| and then + # then the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells. 
+ text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text) + text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text) + text = text.replace('|-', '') + text = text.replace('|', '') + if options.toHTML: + text = cgi.escape(text) + return text + + + # ---------------------------------------------------------------------- + # Expand templates + + maxTemplateRecursionLevels = 30 + maxParameterRecursionLevels = 10 + + # check for template beginning + reOpen = re.compile('(?= self.maxTemplateRecursionLevels: + self.recursion_exceeded_1_errs += 1 + return res + + # logging.debug('%*s %s', self.frame.depth, '', res) + return res + + + def templateParams(self, parameters): + """ + Build a dictionary with positional or name key to expanded parameters. + :param parameters: the parts[1:] of a template, i.e. all except the title. + """ + templateParams = {} + + if not parameters: + return templateParams + # logging.debug('%*s + # Parameters may span several lines, like: + # {{Reflist|colwidth=30em|refs= + # <ref name="Goode">Title</ref> + + # The '=' might occurr within an HTML attribute: + # "<ref name=value" + # but we stop at first. + m = re.match(' *([^=]*?) *?=(.*)', param, re.DOTALL) + if m: + # This is a named parameter. This case also handles parameter + # assignments like "2=xxx", where the number of an unnamed + # parameter ("2") is specified explicitly - this is handled + # transparently. 
+ + parameterName = m.group(1).strip() + parameterValue = m.group(2) + + if ']]' not in parameterValue: # if the value does not contain a link, trim whitespace + parameterValue = parameterValue.strip() + templateParams[parameterName] = parameterValue + else: + # this is an unnamed parameter + unnamedParameterCounter += 1 + + if ']]' not in param: # if the value does not contain a link, trim whitespace + param = param.strip() + templateParams[str(unnamedParameterCounter)] = param + # logging.debug('%*stemplateParams> %s', self.frame.length, '', '|'.join(templateParams.values())) + return templateParams + + + def expandTemplate(self, body): + """Expands template invocation. + :param body: the parts of a template. + + :see http://meta.wikimedia.org/wiki/Help:Expansion for an explanation + of the process. + + See in particular: Expansion of names and values + http://meta.wikimedia.org/wiki/Help:Expansion#Expansion_of_names_and_values + + For most parser functions all names and values are expanded, + regardless of what is relevant for the result. The branching functions + (#if, #ifeq, #iferror, #ifexist, #ifexpr, #switch) are exceptions. + + All names in a template call are expanded, and the titles of the + tplargs in the template body, after which it is determined which + values must be expanded, and for which tplargs in the template body + the first part (default) [sic in the original doc page]. + + In the case of a tplarg, any parts beyond the first are never + expanded. The possible name and the value of the first part is + expanded if the title does not match a name in the template call. 
+ + :see code for braceSubstitution at + https://doc.wikimedia.org/mediawiki-core/master/php/html/Parser_8php_source.html#3397: + + """ + + # template = "{{" parts "}}" + + # Templates and tplargs are decomposed in the same way, with pipes as + # separator, even though eventually any parts in a tplarg after the first + # (the parameter default) are ignored, and an equals sign in the first + # part is treated as plain text. + # Pipes inside inner templates and tplargs, or inside double rectangular + # brackets within the template or tplargs are not taken into account in + # this decomposition. + # The first part is called title, the other parts are simply called parts. + + # If a part has one or more equals signs in it, the first equals sign + # determines the division into name = value. Equals signs inside inner + # templates and tplargs, or inside double rectangular brackets within the + # part are not taken into account in this decomposition. Parts without + # equals sign are indexed 1, 2, .., given as attribute in the tag. + + if self.frame.depth >= self.maxTemplateRecursionLevels: + self.recursion_exceeded_2_errs += 1 + # logging.debug('%*sEXPAND> %s', self.frame.depth, '', body) + return '' + + logging.debug('%*sEXPAND %s', self.frame.depth, '', body) + parts = splitParts(body) + # title is the portion before the first | + title = parts[0].strip() + title = self.expand(title) + + # SUBST + # Apply the template tag to parameters without + # substituting into them, e.g. 
+ # {{subst:t|a{{{p|q}}}b}} gives the wikitext start-a{{{p|q}}}b-end + # @see https://www.mediawiki.org/wiki/Manual:Substitution#Partial_substitution + subst = False + if re.match(substWords, title, re.IGNORECASE): + title = re.sub(substWords, '', title, 1, re.IGNORECASE) + subst = True + + if title in self.magicWords.values: + ret = self.magicWords[title] + logging.debug('%*s 1: + funct = title[:colon] + parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later) + # arguments after first are not evaluated + ret = callParserFunction(funct, parts, self) + logging.debug('%*s 1: + # rest are new parameters + parameters.extend(par[1:]) + else: + parameters = par + elif not parameters: + parameters = [''] # create first param + # add span to last previous parameter + parameters[-1] += paramsList[s:e] + cur = e + # leftover + par = paramsList[cur:].split(sep) + if par: + if parameters: + # portion before | belongs to previous parameter + parameters[-1] += par[0] + if len(par) > 1: + # rest are new parameters + parameters.extend(par[1:]) + else: + parameters = par + + # logging.debug('splitParts %s %s\nparams: %s', sep, paramsList, text_type(parameters)) + return parameters + + +def findMatchingBraces(text, ldelim=0): + """ + :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}. + """ + # Parsing is done with respect to pairs of double braces {{..}} delimiting + # a template, and pairs of triple braces {{{..}}} delimiting a tplarg. + # If double opening braces are followed by triple closing braces or + # conversely, this is taken as delimiting a template, with one left-over + # brace outside it, taken as plain text. For any pattern of braces this + # defines a set of templates and tplargs such that any two are either + # separate or nested (not overlapping). + + # Unmatched double rectangular closing brackets can be in a template or + # tplarg, but unmatched double rectangular opening brackets cannot. 
+ # Unmatched double or triple closing braces inside a pair of + # double rectangular brackets are treated as plain text. + # Other formulation: in ambiguity between template or tplarg on one hand, + # and a link on the other hand, the structure with the rightmost opening + # takes precedence, even if this is the opening of a link without any + # closing, so not producing an actual link. + + # In the case of more than three opening braces the last three are assumed + # to belong to a tplarg, unless there is no matching triple of closing + # braces, in which case the last two opening braces are are assumed to + # belong to a template. + + # We must skip individual { like in: + # {{#ifeq: {{padleft:|1|}} | { | |  }} + # We must resolve ambiguities like this: + # {{{{ }}}} -> { {{{ }}} } + # {{{{{ }}}}} -> {{ {{{ }}} }} + # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}} + # {{{!}} {{!}}} + + # Handle: + # {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}} + # as well as expressions with stray }: + # {{{link|{{ucfirst:{{{1}}}}}} interchange}}} + + if ldelim: # 2-3 + reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim + reNext = re.compile('[{]{2,}|}{2,}') # at least 2 + else: + reOpen = re.compile('{{2,}|\[{2,}') + reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2 + + cur = 0 + while True: + m1 = reOpen.search(text, cur) + if not m1: + return + lmatch = m1.end() - m1.start() + if m1.group()[0] == '{': + stack = [lmatch] # stack of opening braces lengths + else: + stack = [-lmatch] # negative means [ + end = m1.end() + while True: + m2 = reNext.search(text, end) + if not m2: + return # unbalanced + end = m2.end() + brac = m2.group()[0] + lmatch = m2.end() - m2.start() + + if brac == '{': + stack.append(lmatch) + elif brac == '}': + while stack: + openCount = stack.pop() # opening span + if openCount == 0: # illegal unmatched [[ + continue + if lmatch >= openCount: + lmatch -= 
openCount + if lmatch <= 1: # either close or stray } + break + else: + # put back unmatched + stack.append(openCount - lmatch) + break + if not stack: + yield m1.start(), end - lmatch + cur = end + break + elif len(stack) == 1 and 0 < stack[0] < ldelim: + # ambiguous {{{{{ }}} }} + #yield m1.start() + stack[0], end + cur = end + break + elif brac == '[': # [[ + stack.append(-lmatch) + else: # ]] + while stack and stack[-1] < 0: # matching [[ + openCount = -stack.pop() + if lmatch >= openCount: + lmatch -= openCount + if lmatch <= 1: # either close or stray ] + break + else: + # put back unmatched (negative) + stack.append(lmatch - openCount) + break + if not stack: + yield m1.start(), end - lmatch + cur = end + break + # unmatched ]] are discarded + cur = end + + +def findBalanced(text, openDelim=['[['], closeDelim=[']]']): + """ + Assuming that text contains a properly balanced expression using + :param openDelim: as opening delimiters and + :param closeDelim: as closing delimiters. + :return: an iterator producing pairs (start, end) of start and end + positions in text containing a balanced expression. 
+ """ + openPat = '|'.join([re.escape(x) for x in openDelim]) + # pattern for delimiters expected after each opening delimiter + afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)} + stack = [] + start = 0 + cur = 0 + # end = len(text) + startSet = False + startPat = re.compile(openPat) + nextPat = startPat + while True: + next = nextPat.search(text, cur) + if not next: + return + if not startSet: + start = next.start() + startSet = True + delim = next.group(0) + if delim in openDelim: + stack.append(delim) + nextPat = afterPat[delim] + else: + opening = stack.pop() + # assert opening == openDelim[closeDelim.index(next.group(0))] + if stack: + nextPat = afterPat[stack[-1]] + else: + yield start, next.end() + nextPat = startPat + start = next.end() + startSet = False + cur = next.end() + + +# ---------------------------------------------------------------------- +# Modules + +# Only minimal support +# FIXME: import Lua modules. + +def if_empty(*rest): + """ + This implements If_empty from English Wikipedia module: + + Module:If empty + 828 + local p = {} + + function p.main(frame) + local args = require('Module:Arguments').getArgs(frame, {wrappers = 'Template:If empty', removeBlanks = false}) + + -- For backwards compatibility reasons, the first 8 parameters can be unset instead of being blank, + -- even though there's really no legitimate use case for this. At some point, this will be removed. + local lowestNil = math.huge + for i = 8,1,-1 do + if args[i] == nil then + args[i] = '' + lowestNil = i + end + end + + for k,v in ipairs(args) do + if v ~= '' then + if lowestNil < k then + -- If any uses of this template depend on the behavior above, add them to a tracking category. + -- This is a rather fragile, convoluted, hacky way to do it, but it ensures that this module's output won't be modified + -- by it. 
+ frame:extensionTag('ref', '[[Category:Instances of Template:If_empty missing arguments]]', {group = 'TrackingCategory'}) + frame:extensionTag('references', '', {group = 'TrackingCategory'}) + end + return v + end + end + end + + return p + """ + for arg in rest: + if arg: + return arg + return '' + + +# ---------------------------------------------------------------------- +# String module emulation +# https://en.wikipedia.org/wiki/Module:String + +def functionParams(args, vars): + """ + Build a dictionary of var/value from :param: args. + Parameters can be either named or unnamed. In the latter case, their + name is taken fron :param: vars. + """ + params = {} + index = 1 + for var in vars: + value = args.get(var) + if value is None: + value = args.get(str(index)) # positional argument + if value is None: + value = '' + else: + index += 1 + params[var] = value + return params + + +def string_sub(args): + params = functionParams(args, ('s', 'i', 'j')) + s = params.get('s', '') + i = int(params.get('i', 1) or 1) # or handles case of '' value + j = int(params.get('j', -1) or -1) + if i > 0: i -= 1 # lua is 1-based + if j < 0: j += 1 + if j == 0: j = len(s) + return s[i:j] + + +def string_sublength(args): + params = functionParams(args, ('s', 'i', 'len')) + s = params.get('s', '') + i = int(params.get('i', 1) or 1) - 1 # lua is 1-based + len = int(params.get('len', 1) or 1) + return s[i:i+len] + + +def string_len(args): + params = functionParams(args, ('s')) + s = params.get('s', '') + return len(s) + + +def string_find(args): + params = functionParams(args, ('source', 'target', 'start', 'plain')) + source = params.get('source', '') + pattern = params.get('target', '') + start = int('0'+params.get('start', 1)) - 1 # lua is 1-based + plain = int('0'+params.get('plain', 1)) + if source == '' or pattern == '': + return 0 + if plain: + return source.find(pattern, start) + 1 # lua is 1-based + else: + return (re.compile(pattern).search(source, start) or -1) + 1 + + +def 
string_pos(args): + params = functionParams(args, ('target', 'pos')) + target = params.get('target', '') + pos = int(params.get('pos', 1) or 1) + if pos > 0: + pos -= 1 # The first character has an index value of 1 + return target[pos] + + +def string_replace(args): + params = functionParams(args, ('source', 'pattern', 'replace', 'count', 'plain')) + source = params.get('source', '') + pattern = params.get('pattern', '') + replace = params.get('replace', '') + count = int(params.get('count', 0) or 0) + plain = int(params.get('plain', 1) or 1) + if plain: + if count: + return source.replace(pattern, replace, count) + else: + return source.replace(pattern, replace) + else: + return re.compile(pattern).sub(replace, source, count) + + +def string_rep(args): + params = functionParams(args, ('s')) + source = params.get('source', '') + count = int(params.get('count', '1')) + return source * count + + +# ---------------------------------------------------------------------- +# Module:Roman +# http://en.wikipedia.org/w/index.php?title=Module:Roman +# Modulo:Numero_romano +# https://it.wikipedia.org/wiki/Modulo:Numero_romano + +def roman_main(args): + """Convert first arg to roman numeral if <= 5000 else :return: second arg.""" + num = int(float(args.get('1'))) + + # Return a message for numbers too big to be expressed in Roman numerals. + if 0 > num or num >= 5000: + return args.get('2', 'N/A') + + def toRoman(n, romanNumeralMap): + """convert integer to Roman numeral""" + result = "" + for integer, numeral in romanNumeralMap: + while n >= integer: + result += numeral + n -= integer + return result + + # Find the Roman numerals for numbers 4999 or less. 
+ smallRomans = ( + (1000, "M"), + (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), + (90, "XC"), (50, "L"), (40, "XL"), (10, "X"), + (9, "IX"), (5, "V"), (4, "IV"), (1, "I") + ) + return toRoman(num, smallRomans) + +# ---------------------------------------------------------------------- + +modules = { + 'convert': { + 'convert': lambda x, u, *rest: x + ' ' + u, # no conversion + }, + + 'If empty': { + 'main': if_empty + }, + + 'String': { + 'len': string_len, + 'sub': string_sub, + 'sublength': string_sublength, + 'pos': string_pos, + 'find': string_find, + 'replace': string_replace, + 'rep': string_rep, + }, + + 'Roman': { + 'main': roman_main + }, + + 'Numero romano': { + 'main': roman_main + } +} + +# ---------------------------------------------------------------------- +# variables + + +class MagicWords(object): + """ + One copy in each Extractor. + + @see https://doc.wikimedia.org/mediawiki-core/master/php/MagicWord_8php_source.html + """ + names = [ + '!', + 'currentmonth', + 'currentmonth1', + 'currentmonthname', + 'currentmonthnamegen', + 'currentmonthabbrev', + 'currentday', + 'currentday2', + 'currentdayname', + 'currentyear', + 'currenttime', + 'currenthour', + 'localmonth', + 'localmonth1', + 'localmonthname', + 'localmonthnamegen', + 'localmonthabbrev', + 'localday', + 'localday2', + 'localdayname', + 'localyear', + 'localtime', + 'localhour', + 'numberofarticles', + 'numberoffiles', + 'numberofedits', + 'articlepath', + 'pageid', + 'sitename', + 'server', + 'servername', + 'scriptpath', + 'stylepath', + 'pagename', + 'pagenamee', + 'fullpagename', + 'fullpagenamee', + 'namespace', + 'namespacee', + 'namespacenumber', + 'currentweek', + 'currentdow', + 'localweek', + 'localdow', + 'revisionid', + 'revisionday', + 'revisionday2', + 'revisionmonth', + 'revisionmonth1', + 'revisionyear', + 'revisiontimestamp', + 'revisionuser', + 'revisionsize', + 'subpagename', + 'subpagenamee', + 'talkspace', + 'talkspacee', + 'subjectspace', + 'subjectspacee', + 
'talkpagename', + 'talkpagenamee', + 'subjectpagename', + 'subjectpagenamee', + 'numberofusers', + 'numberofactiveusers', + 'numberofpages', + 'currentversion', + 'rootpagename', + 'rootpagenamee', + 'basepagename', + 'basepagenamee', + 'currenttimestamp', + 'localtimestamp', + 'directionmark', + 'contentlanguage', + 'numberofadmins', + 'cascadingsources', + ] + + def __init__(self): + self.values = {'!': '|'} + + def __getitem__(self, name): + return self.values.get(name) + + def __setitem__(self, name, value): + self.values[name] = value + + switches = ( + '__NOTOC__', + '__FORCETOC__', + '__TOC__', + '__TOC__', + '__NEWSECTIONLINK__', + '__NONEWSECTIONLINK__', + '__NOGALLERY__', + '__HIDDENCAT__', + '__NOCONTENTCONVERT__', + '__NOCC__', + '__NOTITLECONVERT__', + '__NOTC__', + '__START__', + '__END__', + '__INDEX__', + '__NOINDEX__', + '__STATICREDIRECT__', + '__DISAMBIG__' + ) + + +magicWordsRE = re.compile('|'.join(MagicWords.switches)) + + +# ---------------------------------------------------------------------- +# parser functions utilities + + +def ucfirst(string): + """:return: a string with just its first character uppercase + We can't use title() since it coverts all words. 
+ """ + if string: + return string[0].upper() + string[1:] + else: + return '' + + +def lcfirst(string): + """:return: a string with its first character lowercase""" + if string: + if len(string) > 1: + return string[0].lower() + string[1:] + else: + return string.lower() + else: + return '' + + +def fullyQualifiedTemplateTitle(templateTitle): + """ + Determine the namespace of the page being included through the template + mechanism + """ + if templateTitle.startswith(':'): + # Leading colon by itself implies main namespace, so strip this colon + return ucfirst(templateTitle[1:]) + else: + m = re.match('([^:]*)(:.*)', templateTitle) + if m: + # colon found but not in the first position - check if it + # designates a known namespace + prefix = normalizeNamespace(m.group(1)) + if prefix in options.knownNamespaces: + return prefix + ucfirst(m.group(2)) + # The title of the page being included is NOT in the main namespace and + # lacks any other explicit designation of the namespace - therefore, it + # is resolved to the Template namespace (that's the default for the + # template inclusion mechanism). + + # This is a defense against pages whose title only contains UTF-8 chars + # that are reduced to an empty string. Right now I can think of one such + # case - which represents the non-breaking space. + # In this particular case, this page is a redirect to [[Non-nreaking + # space]], but having in the system a redirect page with an empty title + # causes numerous problems, so we'll live happier without it. 
+ if templateTitle: + return options.templatePrefix + ucfirst(templateTitle) + else: + return '' # caller may log as error + + +def normalizeNamespace(ns): + return ucfirst(ns) + + +# ---------------------------------------------------------------------- +# Parser functions +# see http://www.mediawiki.org/wiki/Help:Extension:ParserFunctions +# https://github.com/Wikia/app/blob/dev/extensions/ParserFunctions/ParserFunctions_body.php + + +class Infix: + """Infix operators. + The calling sequence for the infix is: + x |op| y + """ + + def __init__(self, function): + self.function = function + + def __ror__(self, other): + return Infix(lambda x, self=self, other=other: self.function(other, x)) + + def __or__(self, other): + return self.function(other) + + def __rlshift__(self, other): + return Infix(lambda x, self=self, other=other: self.function(other, x)) + + def __rshift__(self, other): + return self.function(other) + + def __call__(self, value1, value2): + return self.function(value1, value2) + + +ROUND = Infix(lambda x, y: round(x, y)) + + +from math import floor, ceil, pi, e, trunc, exp, log as ln, sin, cos, tan, asin, acos, atan + + +def sharp_expr(extr, expr): + """Tries converting a lua expr into a Python expr.""" + try: + expr = extr.expand(expr) + expr = re.sub('(?])=', '==', expr) # negative lookbehind + expr = re.sub('mod', '%', expr) # no \b here + expr = re.sub('\bdiv\b', '/', expr) + expr = re.sub('\bround\b', '|ROUND|', expr) + return text_type(eval(expr)) + except: + return '%s' % expr + + +def sharp_if(extr, testValue, valueIfTrue, valueIfFalse=None, *args): + # In theory, we should evaluate the first argument here, + # but it was evaluated while evaluating part[0] in expandTemplate(). + if testValue.strip(): + # The {{#if:}} function is an if-then-else construct. + # The applied condition is: "The condition string is non-empty". 
+ valueIfTrue = extr.expand(valueIfTrue.strip()) # eval + if valueIfTrue: + return valueIfTrue + elif valueIfFalse: + return extr.expand(valueIfFalse.strip()) # eval + return "" + + +def sharp_ifeq(extr, lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args): + rvalue = rvalue.strip() + if rvalue: + # lvalue is always evaluated + if lvalue.strip() == rvalue: + # The {{#ifeq:}} function is an if-then-else construct. The + # applied condition is "is rvalue equal to lvalue". Note that this + # does only string comparison while MediaWiki implementation also + # supports numerical comparissons. + + if valueIfTrue: + return extr.expand(valueIfTrue.strip()) + else: + if valueIfFalse: + return extr.expand(valueIfFalse.strip()) + return "" + + +def sharp_iferror(extr, test, then='', Else=None, *args): + if re.match('<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test): + return extr.expand(then.strip()) + elif Else is None: + return test.strip() + else: + return extr.expand(Else.strip()) + + +def sharp_switch(extr, primary, *params): + # FIXME: we don't support numeric expressions in primary + + # {{#switch: comparison string + # | case1 = result1 + # | case2 + # | case4 = result2 + # | 1 | case5 = result3 + # | #default = result4 + # }} + + primary = primary.strip() + found = False # for fall through cases + default = None + rvalue = None + lvalue = '' + for param in params: + # handle cases like: + # #default = [http://www.perseus.tufts.edu/hopper/text?doc=Perseus...] 
+ pair = param.split('=', 1) + lvalue = extr.expand(pair[0].strip()) + rvalue = None + if len(pair) > 1: + # got "=" + rvalue = extr.expand(pair[1].strip()) + # check for any of multiple values pipe separated + if found or primary in [v.strip() for v in lvalue.split('|')]: + # Found a match, return now + return rvalue + elif lvalue == '#default': + default = rvalue + rvalue = None # avoid defaulting to last case + elif lvalue == primary: + # If the value matches, set a flag and continue + found = True + # Default case + # Check if the last item had no = sign, thus specifying the default case + if rvalue is not None: + return lvalue + elif default is not None: + return default + return '' + + +# Extension Scribunto: https://www.mediawiki.org/wiki/Extension:Scribunto +def sharp_invoke(module, function, args): + functions = modules.get(module) + if functions: + funct = functions.get(function) + if funct: + return text_type(funct(args)) + return '' + + +parserFunctions = { + + '#expr': sharp_expr, + + '#if': sharp_if, + + '#ifeq': sharp_ifeq, + + '#iferror': sharp_iferror, + + '#ifexpr': lambda *args: '', # not supported + + '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), # assuming title is not present + + '#rel2abs': lambda *args: '', # not supported + + '#switch': sharp_switch, + + '#language': lambda *args: '', # not supported + + '#time': lambda *args: '', # not supported + + '#timel': lambda *args: '', # not supported + + '#titleparts': lambda *args: '', # not supported + + # This function is used in some pages to construct links + # http://meta.wikimedia.org/wiki/Help:URL + 'urlencode': lambda extr, string, *rest: quote(string.encode('utf-8')), + + 'lc': lambda extr, string, *rest: string.lower() if string else '', + + 'lcfirst': lambda extr, string, *rest: lcfirst(string), + + 'uc': lambda extr, string, *rest: string.upper() if string else '', + + 'ucfirst': lambda extr, string, *rest: ucfirst(string), + + 'int': lambda extr, string, *rest: 
text_type(int(string)), + +} + + +def callParserFunction(functionName, args, extractor): + """ + Parser functions have similar syntax as templates, except that + the first argument is everything after the first colon. + :return: the result of the invocation, None in case of failure. + + :param: args not yet expanded (see branching functions). + https://www.mediawiki.org/wiki/Help:Extension:ParserFunctions + """ + + try: + # https://it.wikipedia.org/wiki/Template:Str_endswith has #Invoke + functionName = functionName.lower() + if functionName == '#invoke': + module, fun = args[0].strip(), args[1].strip() + logging.debug('%*s#invoke %s %s %s', extractor.frame.depth, '', module, fun, args[2:]) + # special handling of frame + if len(args) == 2: + # find parameters in frame whose title is the one of the original + # template invocation + templateTitle = fullyQualifiedTemplateTitle(module) + if not templateTitle: + logging.warn("Template with empty title") + params = None + frame = extractor.frame + while frame: + if frame.title == templateTitle: + params = frame.args + break + frame = frame.prev + else: + params = [extractor.transform(p) for p in args[2:]] # evaluates them + params = extractor.templateParams(params) + ret = sharp_invoke(module, fun, params) + logging.debug('%*s<#invoke %s %s %s', extractor.frame.depth, '', module, fun, ret) + return ret + if functionName in parserFunctions: + # branching functions use the extractor to selectively evaluate args + return parserFunctions[functionName](extractor, *args) + except: + return "" # FIXME: fix errors + return "" + + +# ---------------------------------------------------------------------- +# Expand using WikiMedia API +# import json + +# def expand(text): +# """Expand templates invoking MediaWiki API""" +# text = urlib.urlencodew(text.encode('utf-8')) +# base = urlbase[:urlbase.rfind('/')] +# url = base + "/w/api.php?action=expandtemplates&format=json&text=" + text +# exp = json.loads(urllib.urlopen(url)) +# 
return exp['expandtemplates']['*'] + +# ---------------------------------------------------------------------- +# Extract Template definition + +reNoinclude = re.compile(r'(?:.*?)', re.DOTALL) +reIncludeonly = re.compile(r'|', re.DOTALL) + +def define_template(title, page): + """ + Adds a template defined in the :param page:. + @see https://en.wikipedia.org/wiki/Help:Template#Noinclude.2C_includeonly.2C_and_onlyinclude + """ + # title = normalizeTitle(title) + + # sanity check (empty template, e.g. Template:Crude Oil Prices)) + if not page: return + + # check for redirects + m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE) + if m: + options.redirects[title] = m.group(1) # normalizeTitle(m.group(1)) + return + + text = unescape(''.join(page)) + + # We're storing template text for future inclusion, therefore, + # remove all text and keep all text + # (but eliminate tags per se). + # However, if ... parts are present, + # then only keep them and discard the rest of the template body. + # This is because using on a text fragment is + # equivalent to enclosing it in tags **AND** + # enclosing all the rest of the template body in tags. + + # remove comments + text = comment.sub('', text) + + # eliminate fragments + text = reNoinclude.sub('', text) + # eliminate unterminated elements + text = re.sub(r'.*$', '', text, flags=re.DOTALL) + text = re.sub(r'', '', text) + + onlyincludeAccumulator = '' + for m in re.finditer('(.*?)', text, re.DOTALL): + onlyincludeAccumulator += m.group(1) + if onlyincludeAccumulator: + text = onlyincludeAccumulator + else: + text = reIncludeonly.sub('', text) + + if text: + if title in options.templates: + logging.warn('Redefining: %s', title) + options.templates[title] = text + + +# ---------------------------------------------------------------------- + +def dropNested(text, openDelim, closeDelim): + """ + A matching function for nested expressions, e.g. namespaces and tables. 
+ """ + openRE = re.compile(openDelim, re.IGNORECASE) + closeRE = re.compile(closeDelim, re.IGNORECASE) + # partition text in separate blocks { } { } + spans = [] # pairs (s, e) for each partition + nest = 0 # nesting level + start = openRE.search(text, 0) + if not start: + return text + end = closeRE.search(text, start.end()) + next = start + while end: + next = openRE.search(text, next.end()) + if not next: # termination + while nest: # close all pending + nest -= 1 + end0 = closeRE.search(text, end.end()) + if end0: + end = end0 + else: + break + spans.append((start.start(), end.end())) + break + while end.end() < next.start(): + # { } { + if nest: + nest -= 1 + # try closing more + last = end.end() + end = closeRE.search(text, end.end()) + if not end: # unbalanced + if spans: + span = (spans[0][0], last) + else: + span = (start.start(), last) + spans = [span] + break + else: + spans.append((start.start(), end.end())) + # advance start, find next close + start = next + end = closeRE.search(text, next.end()) + break # { } + if next != start: + # { { } + nest += 1 + # collect text outside partitions + return dropSpans(spans, text) + + +def dropSpans(spans, text): + """ + Drop from text the blocks identified in :param spans:, possibly nested. + """ + spans.sort() + res = '' + offset = 0 + for s, e in spans: + if offset <= s: # handle nesting + if offset < s: + res += text[offset:s] + offset = e + res += text[offset:] + return res + + +# ---------------------------------------------------------------------- +# WikiLinks + +# May be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc. +# Also: [[Help:IPA for Catalan|[andora]]] + + +def replaceInternalLinks(text): + """ + Replaces internal links of the form: + [[title |...|label]]trail + + with title concatenated with trail, when present, e.g. 's' for plural. 
+ + See https://www.mediawiki.org/wiki/Help:Links#Internal_links + """ + # call this after removal of external links, so we need not worry about + # triple closing ]]]. + cur = 0 + res = '' + for s, e in findBalanced(text): + m = tailRE.match(text, e) + if m: + trail = m.group(0) + end = m.end() + else: + trail = '' + end = e + inner = text[s + 2:e - 2] + # find first | + pipe = inner.find('|') + if pipe < 0: + title = inner + label = title + else: + title = inner[:pipe].rstrip() + # find last | + curp = pipe + 1 + for s1, e1 in findBalanced(inner): + last = inner.rfind('|', curp, s1) + if last >= 0: + pipe = last # advance + curp = e1 + label = inner[pipe + 1:].strip() + res += text[cur:s] + makeInternalLink(title, label) + trail + cur = end + return res + text[cur:] + + +# the official version is a method in class Parser, similar to this: +# def replaceInternalLinks2(text): +# global wgExtraInterlanguageLinkPrefixes + +# # the % is needed to support urlencoded titles as well +# tc = Title::legalChars() + '#%' +# # Match a link having the form [[namespace:link|alternate]]trail +# e1 = re.compile("([%s]+)(?:\\|(.+?))?]](.*)" % tc, re.S | re.D) +# # Match cases where there is no "]]", which might still be images +# e1_img = re.compile("([%s]+)\\|(.*)" % tc, re.S | re.D) + +# holders = LinkHolderArray(self) + +# # split the entire text string on occurrences of [[ +# iterBrackets = re.compile('[[').finditer(text) + +# m in iterBrackets.next() +# # get the first element (all text up to first [[) +# s = text[:m.start()] +# cur = m.end() + +# line = s + +# useLinkPrefixExtension = self.getTargetLanguage().linkPrefixExtension() +# e2 = None +# if useLinkPrefixExtension: +# # Match the end of a line for a word that is not followed by whitespace, +# # e.g. 
in the case of "The Arab al[[Razi]]", "al" will be matched +# global wgContLang +# charset = wgContLang.linkPrefixCharset() +# e2 = re.compile("((?>.*[^charset]|))(.+)", re.S | re.D | re.U) + +# if self.mTitle is None: +# raise MWException(__METHOD__ + ": \self.mTitle is null\n") + +# nottalk = not self.mTitle.isTalkPage() + +# if useLinkPrefixExtension: +# m = e2.match(s) +# if m: +# first_prefix = m.group(2) +# else: +# first_prefix = false +# else: +# prefix = '' + +# useSubpages = self.areSubpagesAllowed() + +# for m in iterBrackets: +# line = text[cur:m.start()] +# cur = m.end() + +# # TODO: Check for excessive memory usage + +# if useLinkPrefixExtension: +# m = e2.match(e2) +# if m: +# prefix = m.group(2) +# s = m.group(1) +# else: +# prefix = '' +# # first link +# if first_prefix: +# prefix = first_prefix +# first_prefix = False + +# might_be_img = False + +# m = e1.match(line) +# if m: # page with normal label or alt +# label = m.group(2) +# # If we get a ] at the beginning of m.group(3) that means we have a link that is something like: +# # [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row fucks up, +# # the real problem is with the e1 regex +# # See bug 1300. +# # +# # Still some problems for cases where the ] is meant to be outside punctuation, +# # and no image is in sight. See bug 2095. +# # +# if label and m.group(3)[0] == ']' and '[' in label: +# label += ']' # so that replaceExternalLinks(label) works later +# m.group(3) = m.group(3)[1:] +# # fix up urlencoded title texts +# if '%' in m.group(1): +# # Should anchors '#' also be rejected? 
+# m.group(1) = str_replace(array('<', '>'), array('<', '>'), rawurldecode(m.group(1))) +# trail = m.group(3) +# else: +# m = e1_img.match(line): +# if m: +# # Invalid, but might be an image with a link in its caption +# might_be_img = true +# label = m.group(2) +# if '%' in m.group(1): +# m.group(1) = rawurldecode(m.group(1)) +# trail = "" +# else: # Invalid form; output directly +# s += prefix + '[[' + line +# continue + +# origLink = m.group(1) + +# # Dont allow internal links to pages containing +# # PROTO: where PROTO is a valid URL protocol these +# # should be external links. +# if (preg_match('/^(?i:' + self.mUrlProtocols + ')/', origLink)) { +# s += prefix + '[[' + line +# continue +# } + +# # Make subpage if necessary +# if useSubpages: +# link = self.maybeDoSubpageLink(origLink, label) +# else: +# link = origLink + +# noforce = origLink[0] != ':' +# if not noforce: +# # Strip off leading ':' +# link = link[1:] + +# nt = Title::newFromText(self.mStripState.unstripNoWiki(link)) +# if nt is None: +# s += prefix + '[[' + line +# continue + +# ns = nt.getNamespace() +# iw = nt.getInterwiki() + +# if might_be_img { # if this is actually an invalid link +# if (ns == NS_FILE and noforce) { # but might be an image +# found = False +# while True: +# # look at the next 'line' to see if we can close it there +# next_line = iterBrakets.next() +# if not next_line: +# break +# m = explode(']]', next_line, 3) +# if m.lastindex == 3: +# # the first ]] closes the inner link, the second the image +# found = True +# label += "[[%s]]%s" % (m.group(0), m.group(1)) +# trail = m.group(2) +# break +# elif m.lastindex == 2: +# # if there is exactly one ]] that is fine, we will keep looking +# label += "[[{m[0]}]]{m.group(1)}" +# else: +# # if next_line is invalid too, we need look no further +# label += '[[' + next_line +# break +# if not found: +# # we couldnt find the end of this imageLink, so output it raw +# # but dont ignore what might be perfectly normal links in the text 
we ve examined +# holders.merge(self.replaceInternalLinks2(label)) +# s += "{prefix}[[%s|%s" % (link, text) +# # note: no trail, because without an end, there *is* no trail +# continue +# } else: # it is not an image, so output it raw +# s += "{prefix}[[%s|%s" % (link, text) +# # note: no trail, because without an end, there *is* no trail +# continue +# } + +# wasblank = (text == '') +# if wasblank: +# text = link +# else: +# # Bug 4598 madness. Handle the quotes only if they come from the alternate part +# # [[Lista d''e paise d''o munno]] . Lista d''e paise d''o munno +# # [[Criticism of Harry Potter|Criticism of ''Harry Potter'']] +# # . Criticism of Harry Potter +# text = self.doQuotes(text) + +# # Link not escaped by : , create the various objects +# if noforce and not nt.wasLocalInterwiki(): +# # Interwikis +# if iw and mOptions.getInterwikiMagic() and nottalk and ( +# Language::fetchLanguageName(iw, None, 'mw') or +# in_array(iw, wgExtraInterlanguageLinkPrefixes)): +# # Bug 24502: filter duplicates +# if iw not in mLangLinkLanguages: +# self.mLangLinkLanguages[iw] = True +# self.mOutput.addLanguageLink(nt.getFullText()) + +# s = rstrip(s + prefix) +# s += strip(trail, "\n") == '' ? 
'': prefix + trail +# continue + +# if ns == NS_FILE: +# if not wfIsBadImage(nt.getDBkey(), self.mTitle): +# if wasblank: +# # if no parameters were passed, text +# # becomes something like "File:Foo.png", +# # which we dont want to pass on to the +# # image generator +# text = '' +# else: +# # recursively parse links inside the image caption +# # actually, this will parse them in any other parameters, too, +# # but it might be hard to fix that, and it doesnt matter ATM +# text = self.replaceExternalLinks(text) +# holders.merge(self.replaceInternalLinks2(text)) +# # cloak any absolute URLs inside the image markup, so replaceExternalLinks() wont touch them +# s += prefix + self.armorLinks( +# self.makeImage(nt, text, holders)) + trail +# else: +# s += prefix + trail +# continue + +# if ns == NS_CATEGORY: +# s = rstrip(s + "\n") # bug 87 + +# if wasblank: +# sortkey = self.getDefaultSort() +# else: +# sortkey = text +# sortkey = Sanitizer::decodeCharReferences(sortkey) +# sortkey = str_replace("\n", '', sortkey) +# sortkey = self.getConverterLanguage().convertCategoryKey(sortkey) +# self.mOutput.addCategory(nt.getDBkey(), sortkey) + +# s += strip(prefix + trail, "\n") == '' ? '' : prefix + trail + +# continue +# } +# } + +# # Self-link checking. For some languages, variants of the title are checked in +# # LinkHolderArray::doVariants() to allow batching the existence checks necessary +# # for linking to a different variant. 
+# if ns != NS_SPECIAL and nt.equals(self.mTitle) and !nt.hasFragment(): +# s += prefix + Linker::makeSelfLinkObj(nt, text, '', trail) +# continue + +# # NS_MEDIA is a pseudo-namespace for linking directly to a file +# # @todo FIXME: Should do batch file existence checks, see comment below +# if ns == NS_MEDIA: +# # Give extensions a chance to select the file revision for us +# options = [] +# descQuery = False +# Hooks::run('BeforeParserFetchFileAndTitle', +# [this, nt, &options, &descQuery]) +# # Fetch and register the file (file title may be different via hooks) +# file, nt = self.fetchFileAndTitle(nt, options) +# # Cloak with NOPARSE to avoid replacement in replaceExternalLinks +# s += prefix + self.armorLinks( +# Linker::makeMediaLinkFile(nt, file, text)) + trail +# continue + +# # Some titles, such as valid special pages or files in foreign repos, should +# # be shown as bluelinks even though they are not included in the page table +# # +# # @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do +# # batch file existence checks for NS_FILE and NS_MEDIA +# if iw == '' and nt.isAlwaysKnown(): +# self.mOutput.addLink(nt) +# s += self.makeKnownLinkHolder(nt, text, array(), trail, prefix) +# else: +# # Links will be added to the output link list after checking +# s += holders.makeHolder(nt, text, array(), trail, prefix) +# } +# return holders + + +def makeInternalLink(title, label): + colon = title.find(':') + if colon > 0 and title[:colon] not in options.acceptedNamespaces: + return '' + if colon == 0: + # drop also :File: + colon2 = title.find(':', colon + 1) + if colon2 > 1 and title[colon + 1:colon2] not in options.acceptedNamespaces: + return '' + if options.keepLinks: + return '%s' % (quote(title.encode('utf-8')), label) + else: + return label + + +# ---------------------------------------------------------------------- +# External links + +# from: https://doc.wikimedia.org/mediawiki-core/master/php/DefaultSettings_8php_source.html 
+ +wgUrlProtocols = [ + 'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://', + 'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:', + 'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://', + 'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', '//' +] + +# from: https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html + +# Constants needed for external link processing +# Everything except bracket, space, or control characters +# \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 +# as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 +EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]' +ANCHOR_CLASS = r'[^][\x00-\x08\x0a-\x1F]' +ExtLinkBracketedRegex = re.compile( + '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)' + + r'\s*((?:' + ANCHOR_CLASS + r'|\[\[' + ANCHOR_CLASS + r'+\]\])' + r'*?)\]', + re.S | re.U) +# A simpler alternative: +# ExtLinkBracketedRegex = re.compile(r'\[(.*?)\](?!])') + +EXT_IMAGE_REGEX = re.compile( + r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+) + /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""", + re.X | re.S | re.U) + + +def replaceExternalLinks(text): + """ + https://www.mediawiki.org/wiki/Help:Links#External_links + [URL anchor text] + """ + s = '' + cur = 0 + for m in ExtLinkBracketedRegex.finditer(text): + s += text[cur:m.start()] + cur = m.end() + + url = m.group(1) + label = m.group(3) + + # # The characters '<' and '>' (which were escaped by + # # removeHTMLtags()) should not be included in + # # URLs, per RFC 2396. 
+ # m2 = re.search('&(lt|gt);', url) + # if m2: + # link = url[m2.end():] + ' ' + link + # url = url[0:m2.end()] + + # If the link text is an image URL, replace it with an tag + # This happened by accident in the original parser, but some people used it extensively + m = EXT_IMAGE_REGEX.match(label) + if m: + label = makeExternalImage(label) + + # Use the encoded URL + # This means that users can paste URLs directly into the text + # Funny characters like ö aren't valid in URLs anyway + # This was changed in August 2004 + s += makeExternalLink(url, label) # + trail + + return s + text[cur:] + + +def makeExternalLink(url, anchor): + """Function applied to wikiLinks""" + if options.keepLinks: + return '%s' % (quote(url.encode('utf-8')), anchor) + else: + return anchor + + +def makeExternalImage(url, alt=''): + if options.keepLinks: + return '%s' % (url, alt) + else: + return alt + + +# ---------------------------------------------------------------------- + +# match tail after wikilink +tailRE = re.compile('\w+') + +syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL) + +# skip level 1, it is page name level +section = re.compile(r'(==+)\s*(.*?)\s*\1') + +listOpen = {'*': '
    ', '#': '
      ', ';': '
      ', ':': '
      '} +listClose = {'*': '
', '#': '', ';': '', ':': ''} +listItem = {'*': '
  • %s
  • ', '#': '
  • %s', ';': '
    %s
    ', + ':': '
    %s
    '} + + +def compact(text): + """Deal with headers, lists, empty sections, residuals of tables. + :param text: convert to HTML. + """ + + page = [] # list of paragraph + headers = {} # Headers for unfilled sections + emptySection = False # empty sections are discarded + listLevel = [] # nesting of lists + listCount = [] # count of each list (it should be always in the same length of listLevel) + for line in text.split('\n'): + if not line: # collapse empty lines + # if there is an opening list, close it if we see an empty line + if len(listLevel): + page.append(line) + if options.toHTML: + for c in reversed(listLevel): + page.append(listClose[c]) + listLevel = [] + listCount = [] + emptySection = False + elif page and page[-1]: + page.append('') + continue + # Handle section titles + m = section.match(line) + if m: + title = m.group(2) + lev = len(m.group(1)) # header level + if options.toHTML: + page.append("%s" % (lev, title, lev)) + if title and title[-1] not in '!?': + title += '.' # terminate sentence. + headers[lev] = title + # drop previous headers + for i in list(headers.keys()): + if i > lev: + del headers[i] + emptySection = True + listLevel = [] + listCount = [] + continue + # Handle page title + elif line.startswith('++'): + title = line[2:-2] + if title: + if title[-1] not in '!?': + title += '.' 
+ page.append(title) + # handle indents + elif line[0] == ':': + # page.append(line.lstrip(':*#;')) + continue + # handle lists + elif line[0] in '*#;:': + i = 0 + # c: current level char + # n: next level char + for c, n in zip_longest(listLevel, line, fillvalue=''): + if not n or n not in '*#;:': # shorter or different + if c: + if options.toHTML: + page.append(listClose[c]) + listLevel = listLevel[:-1] + listCount = listCount[:-1] + continue + else: + break + # n != '' + if c != n and (not c or (c not in ';:' and n not in ';:')): + if c: + # close level + if options.toHTML: + page.append(listClose[c]) + listLevel = listLevel[:-1] + listCount = listCount[:-1] + listLevel += n + listCount.append(0) + if options.toHTML: + page.append(listOpen[n]) + i += 1 + n = line[i - 1] # last list char + line = line[i:].strip() + if line: # FIXME: n is '"' + if options.keepLists: + if options.keepSections: + # emit open sections + items = sorted(headers.items()) + for _, v in items: + page.append("Section::::" + v) + headers.clear() + # use item count for #-lines + listCount[i - 1] += 1 + bullet = 'BULLET::::%d. 
' % listCount[i - 1] if n == '#' else 'BULLET::::- ' + page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line) + elif options.toHTML: + if n not in listItem: + n = '*' + page.append(listItem[n] % line) + elif len(listLevel): + if options.toHTML: + for c in reversed(listLevel): + page.append(listClose[c]) + listLevel = [] + listCount = [] + page.append(line) + + # Drop residuals of lists + elif line[0] in '{|' or line[-1] == '}': + continue + # Drop irrelevant lines + elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '': + continue + elif len(headers): + if options.keepSections: + items = sorted(headers.items()) + for i, v in items: + page.append("Section::::" + v) + headers.clear() + page.append(line) # first line + emptySection = False + elif not emptySection: + # Drop preformatted + if line[0] != ' ': # dangerous + page.append(line) + return page + + +def handle_unicode(entity): + numeric_code = int(entity[2:-1]) + if numeric_code >= 0x10000: return '' + return chr(numeric_code) + + +# ------------------------------------------------------------------------------ +# Output + + +class NextFile(object): + """ + Synchronous generation of next available file name. + """ + + filesPerDir = 100 + + def __init__(self, path_name): + self.path_name = path_name + self.dir_index = -1 + self.file_index = -1 + + def __next__(self): + self.file_index = (self.file_index + 1) % NextFile.filesPerDir + if self.file_index == 0: + self.dir_index += 1 + dirname = self._dirname() + if not os.path.isdir(dirname): + os.makedirs(dirname) + return self._filepath() + + next = __next__ + + def _dirname(self): + char1 = self.dir_index % 26 + char2 = self.dir_index // 26 % 26 + return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) + + def _filepath(self): + return '%s/wiki_%02d' % (self._dirname(), self.file_index) + + +class OutputSplitter(object): + """ + File-like object, that splits output to multiple files of a given max size. 
+ """ + + def __init__(self, nextFile, max_file_size=0, compress=True): + """ + :param nextFile: a NextFile object from which to obtain filenames + to use. + :param max_file_size: the maximum size of each file. + :para compress: whether to write data with bzip compression. + """ + self.nextFile = nextFile + self.compress = compress + self.max_file_size = max_file_size + self.file = self.open(next(self.nextFile)) + + def reserve(self, size): + if self.file.tell() + size > self.max_file_size: + self.close() + self.file = self.open(next(self.nextFile)) + + def write(self, data): + self.reserve(len(data)) + self.file.write(data) + + def close(self): + self.file.close() + + def open(self, filename): + if self.compress: + return bz2.BZ2File(filename + '.bz2', 'w') + else: + return open(filename, 'wb') + + +# ---------------------------------------------------------------------- +# READER + +tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*?>(?:([^<]*)(<.*?>)?)?') +# 1 2 3 4 +keyRE = re.compile(r'key="(\d*)"') +catRE = re.compile(r'\[\[Category:([^\|]+).*\]\].*') # capture the category name [[Category:Category name|Sortkey]]" + +def load_templates(file, output_file=None): + """ + Load templates from :param file:. + :param output_file: file where to save templates and modules. 
+ """ + options.templatePrefix = options.templateNamespace + ':' + options.modulePrefix = options.moduleNamespace + ':' + + if output_file: + output = codecs.open(output_file, 'wb', 'utf-8') + for page_count, page_data in enumerate(pages_from(file)): + id, revid, title, ns,catSet, page = page_data + if not output_file and (not options.templateNamespace or + not options.moduleNamespace): # do not know it yet + # reconstruct templateNamespace and moduleNamespace from the first title + if ns in templateKeys: + colon = title.find(':') + if colon > 1: + if ns == '10': + options.templateNamespace = title[:colon] + options.templatePrefix = title[:colon + 1] + elif ns == '828': + options.moduleNamespace = title[:colon] + options.modulePrefix = title[:colon + 1] + if ns in templateKeys: + text = ''.join(page) + define_template(title, text) + # save templates and modules to file + if output_file: + output.write('\n') + output.write(' %s\n' % title) + output.write(' %s\n' % ns) + output.write(' %s\n' % id) + output.write(' ') + for line in page: + output.write(line) + output.write(' \n') + output.write('\n') + if page_count and page_count % 100000 == 0: + logging.info("Preprocessed %d pages", page_count) + if output_file: + output.close() + logging.info("Saved %d templates to '%s'", len(options.templates), output_file) + + +def pages_from(input): + """ + Scans input extracting pages. + :return: (id, revid, title, namespace key, page), page is a list of lines. 
+ """ + # we collect individual lines, since str.join() is significantly faster + # than concatenation + page = [] + id = None + ns = '0' + last_id = None + revid = None + inText = False + redirect = False + title = None + for line in input: + if not isinstance(line, text_type): line = line.decode('utf-8') + if '<' not in line: # faster than doing re.search() + if inText: + page.append(line) + # extract categories + if line.lstrip().startswith('[[Category:'): + mCat = catRE.search(line) + if mCat: + catSet.add(mCat.group(1)) + continue + m = tagRE.search(line) + if not m: + continue + tag = m.group(2) + if tag == 'page': + page = [] + catSet = set() + redirect = False + elif tag == 'id' and not id: + id = m.group(3) + elif tag == 'id' and id: + revid = m.group(3) + elif tag == 'title': + title = m.group(3) + elif tag == 'ns': + ns = m.group(3) + elif tag == 'redirect': + redirect = True + elif tag == 'text': + if m.lastindex == 3 and line[m.start(3)-2] == '/': # self closing + # + continue + inText = True + line = line[m.start(3):m.end(3)] + page.append(line) + if m.lastindex == 4: # open-close + inText = False + elif tag == '/text': + if m.group(1): + page.append(m.group(1)) + inText = False + elif inText: + page.append(line) + elif tag == '/page': + if id != last_id and not redirect: + yield (id, revid, title, ns,catSet, page) + last_id = id + ns = '0' + id = None + revid = None + title = None + page = [] + + +def process_dump(input_file, template_file, out_file, file_size, file_compress, + process_count): + """ + :param input_file: name of the wikipedia dump file; '-' to read from stdin + :param template_file: optional file with template definitions. + :param out_file: directory where to store extracted data, or '-' for stdout + :param file_size: max size of each extracted file, or None for no max (one file) + :param file_compress: whether to compress files with bzip. + :param process_count: number of extraction processes to spawn. 
+ """ + + if input_file == '-': + input = sys.stdin + else: + input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + + # collect siteinfo + for line in input: + # When an input file is .bz2 or .gz, line can be a bytes even in Python 3. + if not isinstance(line, text_type): line = line.decode('utf-8') + m = tagRE.search(line) + if not m: + continue + tag = m.group(2) + if tag == 'base': + # discover urlbase from the xml dump file + # /mediawiki/siteinfo/base + base = m.group(3) + options.urlbase = base[:base.rfind("/")] + elif tag == 'namespace': + mk = keyRE.search(line) + if mk: + nsid = ''.join(mk.groups()) + else: + nsid = '' + options.knownNamespaces[m.group(3)] = nsid + if re.search('key="10"', line): + options.templateNamespace = m.group(3) + options.templatePrefix = options.templateNamespace + ':' + elif re.search('key="828"', line): + options.moduleNamespace = m.group(3) + options.modulePrefix = options.moduleNamespace + ':' + elif tag == '/siteinfo': + break + + if options.expand_templates: + # preprocess + template_load_start = default_timer() + if template_file: + if os.path.exists(template_file): + logging.info("Loading template definitions from: %s", template_file) + # can't use with here: + file = fileinput.FileInput(template_file, + openhook=fileinput.hook_compressed) + load_templates(file) + file.close() + else: + if input_file == '-': + # can't scan then reset stdin; must error w/ suggestion to specify template_file + raise ValueError("to use templates with stdin dump, must supply explicit template-file") + logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) + load_templates(input, template_file) + input.close() + input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + template_load_elapsed = default_timer() - template_load_start + logging.info("Loaded %d templates in %.1fs", len(options.templates), template_load_elapsed) + + # process pages + 
logging.info("Starting page extraction from %s.", input_file) + extract_start = default_timer() + + # Parallel Map/Reduce: + # - pages to be processed are dispatched to workers + # - a reduce process collects the results, sort them and print them. + + process_count = max(1, process_count) + maxsize = 10 * process_count + # output queue + output_queue = Queue(maxsize=maxsize) + + if out_file == '-': + out_file = None + + worker_count = process_count + + # load balancing + max_spool_length = 10000 + spool_length = Value('i', 0, lock=False) + + # reduce job that sorts and prints output + reduce = Process(target=reduce_process, + args=(options, output_queue, spool_length, + out_file, file_size, file_compress)) + reduce.start() + + # initialize jobs queue + jobs_queue = Queue(maxsize=maxsize) + + # start worker processes + logging.info("Using %d extract processes.", worker_count) + workers = [] + for i in range(worker_count): + extractor = Process(target=extract_process, + args=(options, i, jobs_queue, output_queue)) + extractor.daemon = True # only live while parent process lives + extractor.start() + workers.append(extractor) + + # Mapper process + page_num = 0 + for page_data in pages_from(input): + id, revid, title, ns, catSet, page = page_data + if keepPage(ns, catSet, page): + # slow down + delay = 0 + if spool_length.value > max_spool_length: + # reduce to 10% + while spool_length.value > max_spool_length/10: + time.sleep(10) + delay += 10 + if delay: + logging.info('Delay %ds', delay) + job = (id, revid, title, page, page_num) + jobs_queue.put(job) # goes to any available extract_process + page_num += 1 + page = None # free memory + + input.close() + + # signal termination + for _ in workers: + jobs_queue.put(None) + # wait for workers to terminate + for w in workers: + w.join() + + # signal end of work to reduce process + output_queue.put(None) + # wait for it to finish + reduce.join() + + extract_duration = default_timer() - extract_start + extract_rate = 
page_num / extract_duration + logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", + process_count, page_num, extract_duration, extract_rate) + logging.info("total of page: %d, total of articl page: %d; total of used articl page: %d" % (g_page_total, g_page_articl_total,g_page_articl_used_total)) + + +# ---------------------------------------------------------------------- +# Multiprocess support + + +def extract_process(opts, i, jobs_queue, output_queue): + """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text + :param i: process id. + :param jobs_queue: where to get jobs. + :param output_queue: where to queue extracted text for output. + """ + + global options + options = opts + + createLogger(options.quiet, options.debug, options.log_file) + + out = StringIO() # memory buffer + + + while True: + job = jobs_queue.get() # job is (id, title, page, page_num) + if job: + id, revid, title, page, page_num = job + try: + e = Extractor(*job[:4]) # (id, revid, title, page) + page = None # free memory + e.extract(out) + text = out.getvalue() + except: + text = '' + logging.exception('Processing page: %s %s', id, title) + + output_queue.put((page_num, text)) + out.truncate(0) + out.seek(0) + else: + logging.debug('Quit extractor') + break + out.close() + + +report_period = 10000 # progress report period +def reduce_process(opts, output_queue, spool_length, + out_file=None, file_size=0, file_compress=True): + """Pull finished article text, write series of files (or stdout) + :param opts: global parameters. + :param output_queue: text to be output. + :param spool_length: spool length. + :param out_file: filename where to print. + :param file_size: max file size. + :param file_compress: whether to compress output. 
+ """ + + global options + options = opts + + createLogger(options.quiet, options.debug, options.log_file) + + if out_file: + nextFile = NextFile(out_file) + output = OutputSplitter(nextFile, file_size, file_compress) + else: + output = sys.stdout if PY2 else sys.stdout.buffer + if file_compress: + logging.warn("writing to stdout, so no output compression (use an external tool)") + + interval_start = default_timer() + # FIXME: use a heap + spool = {} # collected pages + next_page = 0 # sequence numbering of page + while True: + if next_page in spool: + output.write(spool.pop(next_page).encode('utf-8')) + next_page += 1 + # tell mapper our load: + spool_length.value = len(spool) + # progress report + if next_page % report_period == 0: + interval_rate = report_period / (default_timer() - interval_start) + logging.info("Extracted %d articles (%.1f art/s)", + next_page, interval_rate) + interval_start = default_timer() + else: + # mapper puts None to signal finish + pair = output_queue.get() + if not pair: + break + page_num, text = pair + spool[page_num] = text + # tell mapper our load: + spool_length.value = len(spool) + # FIXME: if an extractor dies, process stalls; the other processes + # continue to produce pairs, filling up memory. 
+        if len(spool) > 200:
+            logging.debug('Collected %d, waiting: %d, %d', len(spool),
+                          next_page, next_page == page_num)
+    if output != sys.stdout:
+        output.close()
+
+
+# ----------------------------------------------------------------------
+
+# Minimum size of output files
+minFileSize = 200 * 1024
+
+def main():
+
+    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,
+                                     description=__doc__)
+    parser.add_argument("input",
+                        help="XML wiki dump file")
+    groupO = parser.add_argument_group('Output')
+    groupO.add_argument("-o", "--output", default="text",
+                        help="directory for extracted files (or '-' for dumping to stdout)")
+    groupO.add_argument("-b", "--bytes", default="1M",
+                        help="maximum bytes per output file (default %(default)s)",
+                        metavar="n[KMG]")
+    groupO.add_argument("-c", "--compress", action="store_true",
+                        help="compress output files using bzip")
+    groupO.add_argument("--json", action="store_true",
+                        help="write output in json format instead of the default one")
+
+
+    groupP = parser.add_argument_group('Processing')
+    groupP.add_argument("--html", action="store_true",
+                        help="produce HTML output, subsumes --links")
+    groupP.add_argument("-l", "--links", action="store_true",
+                        help="preserve links")
+    groupP.add_argument("-s", "--sections", action="store_true",
+                        help="preserve sections")
+    groupP.add_argument("--lists", action="store_true",
+                        help="preserve lists")
+    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
+                        help="accepted namespaces in links")
+    groupP.add_argument("--templates",
+                        help="use or create file containing templates")
+    groupP.add_argument("--no_templates", action="store_false",
+                        help="Do not expand templates")
+    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
+                        help="Include the document revision id (default=%(default)s)")
+    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
+                        help="Minimum expanded text length required to write document (default=%(default)s)")
+    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
+                        help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
+    groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
+                        help="comma separated list of tags that will be dropped, keeping their content")
+    groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
+                        help="comma separated list of elements that will be removed from the article text")
+    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
+                        help="Preserve tables in the output article text (default=%(default)s)")
+    default_process_count = max(1, cpu_count() - 1)
+    parser.add_argument("--processes", type=int, default=default_process_count,
+                        help="Number of processes to use (default %(default)s)")
+
+    groupS = parser.add_argument_group('Special')
+    groupS.add_argument("-q", "--quiet", action="store_true",
+                        help="suppress reporting progress info")
+    groupS.add_argument("--debug", action="store_true",
+                        help="print debug info")
+    groupS.add_argument("-a", "--article", action="store_true",
+                        help="analyze a file containing a single article (debug option)")
+    groupS.add_argument("--log_file",
+                        help="path to save the log info")
+    groupS.add_argument("-v", "--version", action="version",
+                        version='%(prog)s ' + version,
+                        help="print program version")
+    groupP.add_argument("--filter_category",
+                        help="specify the file that listing the Categories you want to include or exclude. One line for"
+                             " one category. starting with: 1) '#' comment, ignored; 2) '^' exclude; Note: excluding has higher priority than including")
+    args = parser.parse_args()
+
+    options.keepLinks = args.links
+    options.keepSections = args.sections
+    options.keepLists = args.lists
+    options.toHTML = args.html
+    options.write_json = args.json
+    options.print_revision = args.revision
+    options.min_text_length = args.min_text_length
+    if args.html:
+        options.keepLinks = True
+
+    options.expand_templates = args.no_templates
+    options.filter_disambig_pages = args.filter_disambig_pages
+    options.keep_tables = args.keep_tables
+
+    try:
+        power = 'kmg'.find(args.bytes[-1].lower()) + 1
+        file_size = int(args.bytes[:-1]) * 1024 ** power
+        if file_size < minFileSize:
+            raise ValueError()
+    except ValueError:
+        logging.error('Insufficient or invalid size: %s', args.bytes)
+        return
+
+    if args.namespaces:
+        options.acceptedNamespaces = set(args.namespaces.split(','))
+
+    # ignoredTags and discardElemets have default values already supplied, if passed in the defaults are overwritten
+    if args.ignored_tags:
+        ignoredTags = set(args.ignored_tags.split(','))
+    else:
+        ignoredTags = [
+            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
+            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
+            'p', 'plaintext', 's', 'span', 'strike', 'strong',
+            'tt', 'u', 'var'
+        ]
+
+    # 'a' tag is handled separately
+    for tag in ignoredTags:
+        ignoreTag(tag)
+
+    if args.discard_elements:
+        options.discardElements = set(args.discard_elements.split(','))
+
+    FORMAT = '%(levelname)s: %(message)s'
+    logging.basicConfig(format=FORMAT)
+
+    options.quiet = args.quiet
+    options.debug = args.debug
+    options.log_file = args.log_file
+    createLogger(options.quiet, options.debug, options.log_file)
+
+    input_file = args.input
+
+    if not options.keepLinks:
+        ignoreTag('a')
+
+    # sharing cache of parser templates is too slow:
+    # manager = Manager()
+    # templateCache = manager.dict()
+
+    if args.article:
+        if args.templates:
+            if os.path.exists(args.templates):
+                with open(args.templates) as file:
+                    load_templates(file)
+
+        file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
+        for page_data in pages_from(file):
+            id, revid, title, ns, catSet, page = page_data
+            Extractor(id, revid, title, page).extract(sys.stdout)
+        file.close()
+        return
+
+    output_path = args.output
+    if output_path != '-' and not os.path.isdir(output_path):
+        try:
+            os.makedirs(output_path)
+        except:
+            logging.error('Could not create: %s', output_path)
+            return
+
+    filter_category = args.filter_category
+    if (filter_category != None and len(filter_category)>0):
+        with open(filter_category) as f:
+            error_cnt = 0
+            for line in f.readlines():
+                try:
+                    line = str(line.strip())
+                    if line.startswith('#') or len(line) == 0:
+                        continue;
+                    elif line.startswith('^'):
+                        options.filter_category_exclude.add(line.lstrip('^'))
+                    else:
+                        options.filter_category_include.add(line)
+                except Exception as e:
+                    error_cnt += 1
+                    print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt,e))
+                    print(line)
+        logging.info("Excluding categories:",)
+        logging.info(str(options.filter_category_exclude))
+        logging.info("Including categories:")
+        logging.info(str(len(options.filter_category_include)))
+
+    process_dump(input_file, args.templates, output_path, file_size,
+                 args.compress, args.processes)
+
+def createLogger(quiet, debug, log_file):
+    logger = logging.getLogger()
+    if not quiet:
+        logger.setLevel(logging.INFO)
+    if debug:
+        logger.setLevel(logging.DEBUG)
+    #print (log_file)
+    if log_file:
+        fileHandler = logging.FileHandler(log_file)
+        logger.addHandler(fileHandler)
+
+if __name__ == '__main__':
+    main()
diff --git a/helpers/parse_wiki.py b/helpers/parse_wiki.py
new file mode 100644
index 0000000..385bc5e
--- /dev/null
+++ b/helpers/parse_wiki.py
@@ -0,0 +1,195 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+import getpass
+import xml.sax
+from tensorflow.python.keras.utils.data_utils import get_file
+import subprocess
+import unidecode as ud
+import re
+
+
+def list_files(directory):
+    r = []
+    for root, dirs, wikifiles in os.walk(directory):
+        for name in wikifiles:
+            r.append(os.path.join(root, name))
+    return r
+
+
+def is_ascii(s):
+    return all(ord(c) < 128 for c in s)
+
+
+def replace_special_characters(strr):
+    strr = strr.replace("'s", "")
+    strr = strr.replace("l'", "")
+    strr = strr.replace("L'", "")
+
+    characters_to_replace = "ẞß"
+    for character in characters_to_replace:
+        strr = strr.replace(character, "ss")
+
+    characters_to_remove = "„“”«»‚!~`,’'.:;()[]{}=|\@<>@#$%^&*-_+?\""
+    for character in characters_to_remove:
+        strr = strr.replace(character, "")
+
+    strr = ud.unidecode(strr)
+    lst = re.split('[/-]', strr)
+    return lst
+
+
+def is_title_line(title_line):
+    if title_line.find("id=") >= 0 and title_line.find("url=") >= 0 and title_line.find("title=") >= 0:
+        return True
+    return False
+
+
+class WikiXmlHandler(xml.sax.handler.ContentHandler):
+    """Content handler for Wiki XML data using SAX"""
+    def __init__(self):
+        xml.sax.handler.ContentHandler.__init__(self)
+        self._buffer = None
+        self._values = {}
+        self._current_tag = None
+        self._pages = []
+
+    def characters(self, content):
+        """Characters between opening and closing tags"""
+        if self._current_tag:
+            self._buffer.append(content)
+
+    def startElement(self, name, attrs):
+        """Opening tag of element"""
+        if name in ('title', 'text', 'timestamp'):
+            self._current_tag = name
+            self._buffer = []
+
+    def endElement(self, name):
+        """Closing tag of element"""
+        if name == self._current_tag:
+            self._values[name] = ' '.join(self._buffer)
+
+        if name == 'page':
+            self._pages.append((self._values['title'], self._values['text']))
+
+
+username = getpass.getuser()
+keras_home = '/home/' + username + '/.keras/datasets/'
+
+# Downloading the wikipedia dumps
+wikis = ["frwiki", "eswiki", "dewiki"]
+
+for wiki in wikis:
+    base_url = 'https://dumps.wikimedia.org/' + wiki + '/'
+    index = requests.get(base_url).text
+    soup_index = BeautifulSoup(index, 'html.parser')
+
+    # Find the links that are dates of dumps
+    dumps = [a['href'] for a in soup_index.find_all('a') if a.has_attr('href')]
+
+    # Finds the html content of the page for the dump made on 07-August-2020
+    dump_url = base_url + 'latest/'
+
+    # Retrieve the html
+    dump_html = requests.get(dump_url).text
+
+    # Convert to a soup
+    soup_dump = BeautifulSoup(dump_html, 'html.parser')
+    files = []
+
+    # Search through all files
+    for file in soup_dump.find_all('a'):
+        if file.has_attr('href'):
+            text = file['href']
+            # Select the relevant files
+            if 'pages-articles' in text and 'multistream' not in text and text.endswith('.bz2'):
+                files.append((text.split()[0], text.split()[1:]))
+
+    files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
+
+    data_paths = []
+    file_info = []
+
+    if not os.path.exists(keras_home):
+        os.makedirs(keras_home)
+
+    # Iterate through each file
+    for file in files_to_download:
+        path = keras_home + file
+        # Check to see if the path exists (if the file is already downloaded)
+        if not os.path.exists(path):
+            data_paths.append(get_file(file, dump_url + file))
+            # Find the file size in MB
+            file_size = os.stat(path).st_size / 1e6
+            # Find the number of articles
+            file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
+            file_info.append((file, file_size, file_articles))
+
+        # If the file is already downloaded find some information
+        else:
+            data_paths.append(path)
+            # Find the file size in MB
+            file_size = os.stat(path).st_size / 1e6
+            # Find the number of articles
+            file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
+            file_info.append((file.split('-')[-1], file_size, file_number))
+
+
+# Create the dictionaries by extracting the words from the wiki articles
+for wiki in wikis:
+    # titles = dictionary containing the titles from the wiki pages
+    ft = open("titles-" + wiki + ".txt", "w")
+
+    # Create a dictionary containing the worlds extracted from the wiki pages
+    fp = open(wiki + ".txt", "w")
+
+    for filename in os.listdir(keras_home):
+        if not (filename.startswith(wiki) and filename.endswith(".bz2")):
+            continue
+
+        # e.g ~/.keras/datasets/dewiki = directory containing folders with parsed articles' content
+        if not os.path.exists(keras_home + wiki):
+            os.makedirs(keras_home + wiki)
+
+        # Extract page content (text only)
+        subprocess.run(["./WikiExtractor.py", "-o", keras_home + wiki, keras_home + filename])
+
+        # Extract the words from the articles' content and write them in the dictionary
+        for file in list_files(keras_home + wiki):
+            with open(file, 'r') as f:
+                for line in f:
+                    if line.find("</doc>") >= 0:
+                        continue
+
+                    if is_title_line(line):
+                        for word in line.split():
+                            if word.startswith("title="):
+                                words = replace_special_characters(word[7:-2])
+                                for wd in words:
+                                    if len(wd) > 3 and not wd.isdecimal():
+                                        ft.write(wd + "\n")
+                    else:
+                        for word in line.split():
+                            words = replace_special_characters(word)
+                            for wd in words:
+                                if len(wd) > 3 and not wd.isdecimal():
+                                    fp.write(wd + "\n")
+
+        subprocess.run(["rm", "-r", keras_home + wiki])
+
+    ft.close()
+    fp.close()
+
+    # Create dictionary (unique words sorted by frequency)
+    files_to_process = [wiki + ".txt", "titles-" + wiki + ".txt"]
+
+    for file in files_to_process:
+        if os.path.exists("./" + file):
+            log = open('../backend/static/crack/' + file, 'w')
+            p1 = subprocess.Popen(["sort", file], stdout=subprocess.PIPE)
+            p2 = subprocess.Popen(["uniq", "-c"], stdin=p1.stdout, stdout=subprocess.PIPE)
+            p3 = subprocess.Popen(["sort", "-r", "-n", "-s", "-k1,1"], stdin=p2.stdout, stdout=subprocess.PIPE)
+            p4 = subprocess.Popen(["awk", '{print $2}'], stdin=p3.stdout, stdout=log)
+            log.close()
+            subprocess.run(["rm", file])