diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c678a5e --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..18c24a9 --- /dev/null +++ b/Pipfile @@ -0,0 +1,12 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +requests-html = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..8c311ec --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,232 @@ +{ + "_meta": { + "hash": { + "sha256": "992b7fd81898db822d0416144b6720a0f5f3801c56ded482ae0f796a208dd988" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "appdirs": { + "hashes": [ + "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", + "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128" + ], + "version": "==1.4.4" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", + "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", + "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" + ], + "version": "==4.9.3" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", + "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" + ], + "version": "==2020.12.5" + }, + "chardet": { + "hashes": [ + "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa", + "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5" + ], + "markers": 
"python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.0.0" + }, + "cssselect": { + "hashes": [ + "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", + "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.1.0" + }, + "fake-useragent": { + "hashes": [ + "sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35" + ], + "version": "==0.1.11" + }, + "idna": { + "hashes": [ + "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", + "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.10" + }, + "lxml": { + "hashes": [ + "sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d", + "sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3", + "sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2", + "sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f", + "sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927", + "sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3", + "sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7", + "sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f", + "sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade", + "sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468", + "sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b", + "sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4", + "sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83", + "sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04", + 
"sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791", + "sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51", + "sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1", + "sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a", + "sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f", + "sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee", + "sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec", + "sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969", + "sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28", + "sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a", + "sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa", + "sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106", + "sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d", + "sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4", + "sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0", + "sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4", + "sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2", + "sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0", + "sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654", + "sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2", + "sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23", + "sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.6.3" + }, + "parse": { + "hashes": [ + "sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b" + ], + "version": "==1.19.0" + }, + "pyee": { + "hashes": [ + 
"sha256:383973b63ad7ed5e3c0311f8b179c52981f9e7b3eaea0e9a830d13ec34dde65f", + "sha256:92dacc5bd2bdb8f95aa8dd2585d47ca1c4840e2adb95ccf90034d64f725bfd31" + ], + "version": "==8.1.0" + }, + "pyppeteer": { + "hashes": [ + "sha256:c2974be1afa13b17f7ecd120d265d8b8cd324d536a231c3953ca872b68aba4af", + "sha256:d4cb4a5ef94b00c1073aed888b39646ce26cff3339cff7a3f1f1cc307bf50408" + ], + "markers": "python_full_version >= '3.6.1' and python_full_version < '4.0.0'", + "version": "==0.2.5" + }, + "pyquery": { + "hashes": [ + "sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963", + "sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72" + ], + "version": "==1.4.3" + }, + "requests": { + "hashes": [ + "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", + "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==2.25.1" + }, + "requests-html": { + "hashes": [ + "sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947", + "sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2" + ], + "index": "pypi", + "version": "==0.10.0" + }, + "six": { + "hashes": [ + "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", + "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "version": "==1.15.0" + }, + "soupsieve": { + "hashes": [ + "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc", + "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b" + ], + "markers": "python_version >= '3.0'", + "version": "==2.2.1" + }, + "tqdm": { + "hashes": [ + "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3", + "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae" + ], + "markers": 
"python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.60.0" + }, + "urllib3": { + "hashes": [ + "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df", + "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_full_version < '4.0.0'", + "version": "==1.26.4" + }, + "w3lib": { + "hashes": [ + "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53", + "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df" + ], + "version": "==1.22.0" + }, + "websockets": { + "hashes": [ + "sha256:0e4fb4de42701340bd2353bb2eee45314651caa6ccee80dbd5f5d5978888fed5", + "sha256:1d3f1bf059d04a4e0eb4985a887d49195e15ebabc42364f4eb564b1d065793f5", + "sha256:20891f0dddade307ffddf593c733a3fdb6b83e6f9eef85908113e628fa5a8308", + "sha256:295359a2cc78736737dd88c343cd0747546b2174b5e1adc223824bcaf3e164cb", + "sha256:2db62a9142e88535038a6bcfea70ef9447696ea77891aebb730a333a51ed559a", + "sha256:3762791ab8b38948f0c4d281c8b2ddfa99b7e510e46bd8dfa942a5fff621068c", + "sha256:3db87421956f1b0779a7564915875ba774295cc86e81bc671631379371af1170", + "sha256:3ef56fcc7b1ff90de46ccd5a687bbd13a3180132268c4254fc0fa44ecf4fc422", + "sha256:4f9f7d28ce1d8f1295717c2c25b732c2bc0645db3215cf757551c392177d7cb8", + "sha256:5c01fd846263a75bc8a2b9542606927cfad57e7282965d96b93c387622487485", + "sha256:5c65d2da8c6bce0fca2528f69f44b2f977e06954c8512a952222cea50dad430f", + "sha256:751a556205d8245ff94aeef23546a1113b1dd4f6e4d102ded66c39b99c2ce6c8", + "sha256:7ff46d441db78241f4c6c27b3868c9ae71473fe03341340d2dfdbe8d79310acc", + "sha256:965889d9f0e2a75edd81a07592d0ced54daa5b0785f57dc429c378edbcffe779", + "sha256:9b248ba3dd8a03b1a10b19efe7d4f7fa41d158fdaa95e2cf65af5a7b95a4f989", + "sha256:9bef37ee224e104a413f0780e29adb3e514a5b698aabe0d969a6ba426b8435d1", + "sha256:c1ec8db4fac31850286b7cd3b9c0e1b944204668b8eb721674916d4e28744092", 
+ "sha256:c8a116feafdb1f84607cb3b14aa1418424ae71fee131642fc568d21423b51824", + "sha256:ce85b06a10fc65e6143518b96d3dca27b081a740bae261c2fb20375801a9d56d", + "sha256:d705f8aeecdf3262379644e4b55107a3b55860eb812b673b28d0fbc347a60c55", + "sha256:e898a0863421650f0bebac8ba40840fc02258ef4714cb7e1fd76b6a6354bda36", + "sha256:f8a7bff6e8664afc4e6c28b983845c5bc14965030e3fb98789734d416af77c4b" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==8.1" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..d05be9d --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# LowestCommonAncestorExtractor + +A python library for the structured extraction of content from German and English Terms and Conditions. Developed by [Tobias Schamel](https://wwwmatthes.in.tum.de/pages/665u6pdbc45i/Bachelor-s-Thesis-Tobias-Schamel) as part of the [AGB-Check](project) project. + +For citation, please use: +``` +@InProceedings{schamel-EtAl:2022:ECNLP, + author = {Schamel, Tobias and Braun, Daniel and Matthes, Florian}, + title = {Structured Extraction of Terms and Conditions from German and English Online Shops}, + booktitle = {Proceedings of The Fifth Workshop on e-Commerce and NLP (ECNLP 5)}, + month = {May}, + year = {2022}, + address = {Dublin, Ireland}, + publisher = {Association for Computational Linguistics} +} + +``` + +## License + +The software is provided under the MIT license. + +## Acknowledgements + +The project was supported by funds of the Federal Ministry for the Environment, Nature Conservation, +Nuclear Safety and Consumer Protection (BMUV) based on a decision of the Parliament of the Federal +Republic of Germany via the Federal Office for Agriculture and Food (BLE) under the innovation +support programme. 
# Extracting the main content using the method presented in the thesis.
# The style covering the most characters is determined first; the lowest
# common ancestor covering at least 'threshold' of that style becomes the
# main-content node, which is then stripped of empty nodes.
# Attributes: body = body HTML node; contentExtractor = content extraction method;
# threshold = minimum coverage for main content node.
def getMainContent(bodyDOMNode, contentExtractor, threshold):
    frequencies = getFrequencyOfStyles(bodyDOMNode, contentExtractor)
    dominantStyle = max(frequencies, key=frequencies.get)
    mainContent = getLowestCommonAncestorNodeOfStyle(
        bodyDOMNode, dominantStyle, frequencies[dominantStyle],
        threshold, contentExtractor)

    # Clean main content from empty nodes and other
    cleanMainContent(mainContent)
    return mainContent
# Calculate the depth of the subtree expanded by the given 'node'.
# A node without children has depth 0; otherwise the depth is one more
# than the deepest child subtree.
# Attributes: node = node, of which subtree is investigated
def getDepth(node):
    if not node.children:
        return 0
    # The original's "1 if len(depths) == 0" guard was dead code: the
    # children list is known to be non-empty on this path.
    return 1 + max(getDepth(child) for child in node.children)
# Find lowest common ancestor of a given style covering at least 'threshhold' (%)
# of all occurrences of the given style ('noOfCharacters') in the document
# (i.e. in tree expanded by 'rootNode'). Whenever there is no such node,
# extract a maximum subsequence of direct children to the body containing the style.
# Attributes: rootNode = root node of whole content tree; style = most common style;
# noOfCharacters = number of total (valid) characters for the style;
# threshhold = minimum coverage for main content node; contentExtractor = content
# extraction method
def getLowestCommonAncestorNodeOfStyle(rootNode, style, noOfCharacters, threshhold, contentExtractor):
    candidates = []
    getLowestCommonAncestorNodeOfStyleRec(
        rootNode, style, noOfCharacters, 1, candidates, contentExtractor)

    # Scan from the deepest level upwards; the first sufficiently covering
    # node at the deepest possible level wins.
    for depth in range(getDepth(rootNode), 0, -1):
        for node, coverage, nodeDepth in candidates:
            if nodeDepth == depth and coverage > threshhold:
                return node

    # Fallback: longest run of direct children that contain the style at all.
    # NOTE(review): largest *relevant character* subsequence might be better —
    # kept as a run-of-children heuristic, as in the thesis implementation.
    bitmap = [style in getFrequencyOfStyles(child, contentExtractor)
              for child in rootNode.children]
    run = findMaximumSubsequenceOfTrue(bitmap)

    synthetic = TreeElement()
    synthetic.children = [rootNode.children[i] for i in run]
    synthetic.tag = 'body'
    synthetic.xpath = '/html/body'
    return synthetic
# Calculate the number of (valid, according to used content extractor) characters in a node.
# Only direct TextElement children are counted; for the *ShortTextExclusion
# extractors, fragments of three words or fewer are skipped.
# Attributes: node = node investigated; contentExtractor = content extraction method
def numberOfCharacters(node, contentExtractor):
    total = 0
    if contentExtractor in (ContentExtractor.RenderedStyle, ContentExtractor.NaiveStyle):
        for child in node.children:
            if isinstance(child, TextElement) and child.text is not None:
                total += len(child.text)
    elif contentExtractor in (ContentExtractor.NaiveStyleAndShortTextExclusion,
                              ContentExtractor.RenderedStyleAndShortTextExclusion):
        # BUG FIX: the original condition was
        # `... == NaiveStyleAndShortTextExclusion or RenderedStyleAndShortTextExclusion`,
        # whose right operand is an always-truthy enum member, so this branch
        # swallowed every extractor value that missed the first test.
        for child in node.children:
            if (isinstance(child, TextElement) and child.text is not None
                    and len(child.text.split()) > 3):
                total += len(child.text)
    return total
# Search for the range with the maximum subsequence of 'True' in a bitmap.
# Ties are won by the earliest run; the result is a (possibly empty) range
# of indices into the bitmap.
# Attributes: bitmap = bitmap to be investigated
def findMaximumSubsequenceOfTrue(bitmap):
    # Renamed parameter: the original shadowed the builtin `list`.
    best = range(0, 0)
    runStart = 0
    runLength = 0
    for index, flag in enumerate(bitmap):
        if flag:
            runLength += 1
            # Strict '<' keeps the first maximal run on ties.
            if len(best) < runLength:
                best = range(runStart, runStart + runLength)
        else:
            runStart = index + 1
            runLength = 0
    return best
# Parse the HTML doc downloaded by Selenium.
# Builds a TreeElement mirror of the <body> element; CSS style information is
# rendered along the way only when extractStyle is set.
# Attributes: driver = Selenium driver; extractStyle = whether style is rendered in this step
def parseTree(driver, extractStyle):
    tree = etree.parse(StringIO(driver.page_source), etree.HTMLParser())
    body = tree.xpath('/html/body')[0]
    bodyXpath = tree.getpath(body)

    result = TreeElement()
    result.tag = body.tag
    result.xpath = bodyXpath
    if extractStyle:
        result.style = parseStyle(driver, bodyXpath)
    result.children = parseSeleniumTreeRec(
        driver, tree, body.xpath('child::node()'), extractStyle)
    return result
# Cut spaces (ASCII 32 only — tabs etc. are kept) at begin and end from text.
# Attributes: text = input string
def cutStartAndEndWhitespaces(text):
    # str.strip(' ') replaces the hand-rolled index scan and also fixes the
    # original's corner case: a string consisting only of spaces was returned
    # unchanged instead of empty.
    return text.strip(' ')
# Checks, whether a TextElement node has no content.
# A character counts as content if its code point is above 32, i.e. it is
# neither a space nor a control character (newlines count as empty).
# Attributes: textNode = TextElement which is investigated
def hasNoContent(textNode):
    # any() short-circuits on the first content character, unlike the
    # original loop which always scanned the full string.
    return not any(ord(c) > 32 for c in textNode.text)
class Font:
    """Value object describing the rendered style of a text fragment.

    Two Font instances compare equal iff weight, underline flag, size and
    family all match, so Font objects can serve as dictionary keys
    (e.g. for style frequency counting).
    """

    def __init__(self, size, weight, underlined, fontFamily):
        self.weight = weight            # numeric CSS font-weight
        self.isUnderlined = underlined  # True if text-decoration contains 'underline'
        self.fontSize = size            # font size (px value as float)
        self.fontFamily = fontFamily    # CSS font-family string

    def __str__(self):
        # Kept intentionally empty to preserve the original behaviour.
        return ""

    def __repr__(self):
        # Debug-friendly representation (the original had none).
        return (f"Font(size={self.fontSize!r}, weight={self.weight!r}, "
                f"underlined={self.isUnderlined!r}, fontFamily={self.fontFamily!r})")

    def __eq__(self, other):
        # BUG FIX: the original only guarded against None and raised
        # AttributeError for any other non-Font operand. NotImplemented lets
        # Python fall back to identity, so Font == None is still False.
        if not isinstance(other, Font):
            return NotImplemented
        return (self.weight == other.weight
                and self.isUnderlined == other.isUnderlined
                and self.fontFamily == other.fontFamily
                and self.fontSize == other.fontSize)

    def __hash__(self):
        # Tuple hash mixes the fields better than the original XOR, which
        # collided whenever two fields hashed to the same value.
        return hash((self.weight, self.isUnderlined, self.fontSize, self.fontFamily))
# Retrieve numeration from given text.
# Recognises decimal, roman and alphabetic counters (optionally nested with
# separators like '.', '-', ',' or ':', e.g. "1.2)" or "§ 3"), normalises
# them to dot-separated levels and returns them translated by translateNums.
# An empty list means no numeration was found.
def getNumeration(txt):
    # Raw strings: the original non-raw pattern relied on escape sequences
    # like '\s' and '\.' which newer Python versions flag as invalid escapes.
    possibleNums = re.search(
        r"\s[\(§]?(([IVXLivxl]{1,7})|([0-9]{1,2})|[a-zA-Z])"
        r"([\.\-,:](([IVXLivxl]{1,7})|([0-9]{1,2})|[a-zA-Z]))*[\-:\.)]?\s",
        ' ' + txt + ' ')
    if possibleNums is None:
        return []

    # Normalise every supported separator to '.' and drop decoration
    # ('(' and '§') in a single pass instead of eight chained str.replace
    # calls; the trailing whitespace of the match is cut first.
    normalized = possibleNums[0][:-1].translate(str.maketrans(
        {'-': '.', ',': '.', ':': '.', ')': '.', ' ': '.', '\xa0': '.',
         '(': None, '§': None}))

    # Splitting string into different levels at dots.
    return translateNums(normalized.split('.'))
# Translate list of strings to integer list; supports decimal numbers,
# roman numbers & letters. Each recognised entry becomes a
# (value, EnumerationType) pair; unrecognised entries are silently dropped.
def translateNums(nums):
    translated = []
    for token in nums:
        if re.search("[IVXLivxl]{1,7}", token):
            # Roman digits win over the generic letter case below.
            translated.append((RomanNumber(token).getValue(), EnumerationType.Roman))
        elif re.search("[0-9]{1,2}", token):
            translated.append((int(token), EnumerationType.Numeric))
        elif re.search("[a-zA-Z]", token):
            # 'a'/'A' -> 1, 'b'/'B' -> 2, ...
            translated.append((ord(token.lower()) - 96, EnumerationType.Alphabetic))
    return translated
# Generate a list of TextFractions by traversing the tree (depth-first) and safe every content
# string to the TF list.
# Attributes: mainContent = root node of main content
def getTFList(mainContent):
    # The leading and trailing split markers guarantee that formListFromTF
    # always sees well-delimited paragraphs, even for a document that
    # produces only a single paragraph.
    fractions = [Type.split]
    getTFListRec(mainContent, fractions)
    fractions.append(Type.split)
    return fractions
# Attributes: node = node to be processed; toFill = list of TextFractions
def getTFListRec(node, toFill):
    # Tag comparison appends '$' on both sides so e.g. 'b' cannot
    # accidentally prefix-match 'blockquote' entries in the tag list.
    if (str(node.tag) + '$') in getParagraphFormingTags():
        # Paragraph-forming tag: isolate its content between split markers.
        toFill.append(Type.split)

        for child in node.children:
            if isinstance(child, TextElement):
                # Text leaves inherit the style and tag of the enclosing node.
                toFill.append(TextFraction(child.text, node.style, node.tag))
            elif isinstance(child, TreeElement):
                if (str(child.tag) + '$') == 'li$':
                    # List items are additionally wrapped in listStart/listEnd
                    # so that formBlockTF can assign them a List numeration.
                    toFill.append(Type.split)
                    toFill.append(Type.listStart)
                    getTFListRec(child, toFill)
                    toFill.append(Type.listEnd)
                    toFill.append(Type.split)
                else:
                    getTFListRec(child, toFill)

        toFill.append(Type.split)
    elif (str(node.tag) + '$') == 'br$':
        # Line breaks end the current paragraph without contributing text.
        toFill.append(Type.split)
    else:
        # Inline element: collect its text fractions but emit no split
        # markers, so its content stays inside the surrounding paragraph.
        for child in node.children:
            if isinstance(child, TextElement):
                toFill.append(TextFraction(child.text, node.style, node.tag))
            elif isinstance(child, TreeElement):
                getTFListRec(child, toFill)
# Return list of all tags + '$' rendered as a paragraph in HTML documents.
# These tags trigger a 'split' during the formation of a TF list.
# Fixes: 'blockquote' was missing its '$' suffix (so <blockquote> never
# triggered a split, since all lookups append '$'), and 'dt$' was duplicated.
def getParagraphFormingTags():
    return ['article$', 'section$', 'nav$', 'aside$', 'h1$', 'h2$', 'h3$', 'h4$', 'h5$', 'h6$', 'hgroup$', 'header$',
            'footer$', 'address$', 'p$', 'pre$', 'blockquote$', 'ol$', 'ul$', 'menu$', 'li$', 'dl$', 'dt$', 'dd$',
            'figure$', 'figcaption$', 'main$', 'div$', 'summary$', 'td$', 'th$', 'caption$', 'legend$', 'form$',
            'fieldset$', 'details$']
# Attributes: rootNode = rootNode of the visually and hierarchically separated hierarchy tree
def adjustListNums(rootNode):
    # Only BlockNodes have children to regroup; plain Blocks are handled by
    # their parent node.
    if isinstance(rootNode, BlockNode):
        # sepList collects child indexes where a new group starts; the leading
        # 0 is a sentinel covering content before the first list item.
        sepList = []
        sepList.append(0)
        for i in range(0, len(rootNode.children)):

            if isinstance(rootNode.children[i], BlockNode):
                # Recurse first: nested nodes are adjusted independently.
                adjustListNums(rootNode.children[i])
            else:
                if EnumerationType.List in rootNode.children[i].getNumerationPattern():
                    # Child carries a List numeration (set by formBlockTF).
                    sepList.append(i)
                elif i - 1 == sepList[len(sepList) - 1] and sepList[len(sepList) - 1] != 0:
                    # Child immediately follows the last recorded list item;
                    # presumably this marks the end of the list run —
                    # TODO(review) confirm intent.
                    sepList.append(i)

        if len(sepList) > 1:
            # At least one list item was found: regroup children so each
            # separator index starts a new headline-less BlockNode.
            sepList.append(sepList[len(sepList)-1]+1)
            sepList.append(len(rootNode.children))
            newChildren = []
            for i in range(0, len(sepList) - 1):
                toAppend = BlockNode()
                # Empty headline: list groups have no own title.
                toAppend.headline = Block('', None)
                toAppend.children = rootNode.children[sepList[i]:sepList[i+1]]
                newChildren.append(toAppend)

            rootNode.children = newChildren
# Check the textual content of a block for occuring enumeration patterns (REC).
# Attributes: blockList = block list of textual content to be checked
def separateBlocksNumsRec(blockList):
    # Every distinct numeration pattern present in blockList is a candidate
    # headline style; the first pattern that validates wins.
    headlineStyles = getAllStyles(blockList)
    for style in headlineStyles:
        # isValidNumStyleRes = (pattern is usable, blocks carry no headline).
        isValidNumStyleRes = isValidNumStyle(style, blockList)
        if isValidNumStyleRes[0]:
            # Indexes of all blocks carrying this numeration pattern.
            headlineList = []
            for i in range(0, len(blockList)):
                if isinstance(blockList[i], Block) and blockList[i].getNumerationPattern() == style[0]:
                    headlineList.append(i)
            if isValidNumerationPattern(blockList, headlineList):
                # Sentinel index so the final slice reaches the end of the list.
                headlineList.append(len(blockList))

                # Blocks before the first headline stay at the current level.
                newChildren = blockList[0:headlineList[0]]
                for i in range(0, len(headlineList) - 1):
                    toAppend = BlockNode()
                    if isValidNumStyleRes[1]:
                        # no headline
                        toAppend.headline = Block('', None)
                        toAppend.children = blockList[(headlineList[i]):headlineList[i+1]]
                    else:
                        # with headline
                        toAppend.headline = blockList[headlineList[i]]
                        toAppend.children = separateBlocksNumsRec(blockList[(headlineList[i] + 1):headlineList[i+1]])
                    newChildren.append(toAppend)
                return newChildren
    # No usable pattern found: keep the flat list unchanged.
    return blockList
# Attributes: style = numeration pattern as (pattern, block index); nodeChildren = list of blocks
def isValidNumStyle(style, nodeChildren):
    # Returns a tuple (patternIsValid, blocksHaveNoHeadline).
    # List enumerations are handled separately by adjustListNums.
    if EnumerationType.List in style[0]:
        return (False, False)
    # The pattern must occur on at least two Blocks to form a hierarchy level.
    if len(list(filter(lambda block: block.getNumerationPattern() == style[0],\
        list(filter(lambda child: isinstance(child, Block), nodeChildren))))) <= 1:
        return (False, False)

    # A long block (>= 10 words) is content rather than a headline, so the
    # groups formed from this pattern get no headline of their own.
    if len(nodeChildren[style[1]].text.split()) >= 10:
        return (True, True)
    elif style[1] == len(list(filter(lambda child: isinstance(child, Block), nodeChildren))) - 1:
        # Last Block: compare against the predecessor instead of a successor.
        # NOTE(review): style[1] indexes nodeChildren directly — this assumes
        # all Blocks precede any BlockNodes in the list; TODO confirm.
        return (nodeChildren[style[1]].getNumerationPattern() != nodeChildren[style[1] - 1].getNumerationPattern(), False)
    else:
        # A genuine headline's pattern must differ from its neighbour's.
        return (nodeChildren[style[1]].getNumerationPattern() != nodeChildren[style[1] + 1].getNumerationPattern(), False)
# Check if a step within a numeration pattern is valid.
# Attributes: num1 = first numeration pattern; num2 = second numeration pattern
def isValidStep(num1, num2):
    # A successor headline may advance the combined numeral total by exactly
    # one or two (e.g. 1.2 -> 1.3 or 1.2 -> 2.1).
    previousTotal = sum(value for value, _ in num1)
    currentTotal = sum(value for value, _ in num2)
    return 1 <= currentTotal - previousTotal <= 2
# Façade for different hierarchy extraction approaches.
# Attributes: mainContentDOMNode = node holding the main content; contentExtractor = content extractor type
def extractHierarchy(mainContentDOMNode, contentExtractor):
    # Local alias keeps the membership tests below readable.
    CE = src.ContentExtractor.ContentExtractorTypes.ContentExtractor
    defaultStyle = None

    # MCS is needed to determine non-headline style.
    # BUG FIX: the original conditions read `contentExtractor is A or B`,
    # which parses as `(contentExtractor is A) or B`; since B is a truthy
    # enum member the first branch always ran and the elif was dead code.
    if contentExtractor in (CE.NaiveStyleAndShortTextExclusion, CE.RenderedStyleAndShortTextExclusion):
        dic = getFrequencyOfStyles(mainContentDOMNode, CE.RenderedStyleAndShortTextExclusion)
        defaultStyle = max(dic, key=dic.get)
    elif contentExtractor in (CE.RenderedStyle, CE.NaiveStyle):
        dic = getFrequencyOfStyles(mainContentDOMNode, CE.RenderedStyle)
        defaultStyle = max(dic, key=dic.get)

    # Visual separation first, then refinement via enumeration patterns
    # (extractHierarchyNumerically mutates the tree in place).
    hierarchyTree = extractHierarchyVisually(mainContentDOMNode, defaultStyle)
    extractHierarchyNumerically(hierarchyTree)
    return hierarchyTree
+ + + # Get integer decimal value of a roman number. + def getValue(self): + if not self.valid: + return -1 + else: + reversed = self.num[::-1] + indexList = ['I', 'V', 'X', 'L'] + lastIndex = 0 + sum = 0 + for c in reversed: + if indexList.index(c) < lastIndex: + sum -= self.resolve[c] + else: + lastIndex = indexList.index(c) + sum += self.resolve[c] + return sum + + + def isValid(self): + return self.valid + diff --git a/src/HierarchyExtractor/VisualStyleHierarchyExtractor.py b/src/HierarchyExtractor/VisualStyleHierarchyExtractor.py new file mode 100644 index 0000000..790c501 --- /dev/null +++ b/src/HierarchyExtractor/VisualStyleHierarchyExtractor.py @@ -0,0 +1,93 @@ +from src.DOMParser.Font import Font +from src.HierarchyExtractor.BlockList import BlockList +from src.HierarchyExtractor.BlockNode import BlockNode + + +# Visual-based hierarchy extraction. +# Attributes: mainContent = main content dom node; defaultStyle = MCS not regarded as a possible headline +def extractHierarchyVisually(mainContent, defaultStyle): + blockList = BlockList(mainContent) + #printBlockList(blockList) + hierarchyTree = separateBlocks(blockList, defaultStyle) + return hierarchyTree + + + +# Print a list of blocks and its associated information (DEBUGGING). +# Attributes: blockList = list of blocks +def printBlockList(blockList): + for entry in blockList.list: + print(str(entry.style) + '\t' + str(entry.numeration) + '\t' + entry.text) + + + +# Separate blocks using a visual-based hierarchy extraction. +# Attributes: blockList = list of blocks; defaultStyle = MCS +def separateBlocks(blockList, defaultStyle): + toReturn = BlockNode() + toReturn.children = separateBlocksRec(blockList.list, defaultStyle) + return toReturn + + + +# Separate blocks using a visual-based hierarchy extraction (REC). 
# Determines whether a style is more prominent than the MCS.
# Attributes: style = currently investigated style; defaultStyle = MCS
def isMoreProminent(style, defaultStyle):
    # Underlining or a heavier weight wins outright; otherwise a larger
    # font size decides.
    if style.isUnderlined:
        return True
    if style.weight > defaultStyle.weight:
        return True
    return style.fontSize > defaultStyle.fontSize
# Attributes: list = block list; defaultStyle = MCS
# NOTE(review): parameter name `list` shadows the Python builtin; renaming
# would alter the public signature, so it is only flagged here.
def findNextHeadlineStyle(list, defaultStyle):
    # Scan in document order and return the first candidate headline style:
    # different from the main content style, not the placeholder style,
    # short (<= 10 words) and visually more prominent than the MCS.
    # Font(1, 300, False, 'undefined') is presumably the "unknown style"
    # sentinel produced by the parser — TODO confirm.
    for elem in list:
        if (elem.style != defaultStyle and elem.style != Font(1, 300, False, 'undefined')) \
                and (len(elem.text.split())) <= 10 and isMoreProminent(elem.style, defaultStyle):
            return elem.style
    # No headline style found at this level.
    return None
# Extract a terms-&-conditions page into the JSON target structure.
# Pipeline: Downloader -> Content Extractor -> (optional) style extraction ->
# Hierarchy Extractor -> Target Structure generation.
# Attributes: url = page to process; contentExtractor = extraction strategy;
# threshold = minimum main-content coverage (must be > 0.5);
# driver = optional Selenium driver reused across calls (may be None).
def extractTandC(url, contentExtractor=src.ContentExtractor.ContentExtractorTypes.ContentExtractor.NaiveStyleAndShortTextExclusion,
                 threshold=0.85, driver=None):

    # Check for legality of threshold.
    if not threshold > 0.5:
        print('Threshold must be above 0.5!')
        return None

    # Use Downloader component:
    # Rendered-style extractors need the styles resolved during download.
    extractStyle = False
    if contentExtractor is src.ContentExtractor.ContentExtractorTypes.ContentExtractor.RenderedStyle \
            or contentExtractor is src.ContentExtractor.ContentExtractorTypes.ContentExtractor.RenderedStyleAndShortTextExclusion:
        extractStyle = True
    # getDOMTree returns (bodyNode, title).
    website = getDOMTree(url, extractStyle, driver)
    title = website[1]
    bodyNode = website[0]


    # Use Content Extractor component:
    mainContent = getMainContent(bodyNode, contentExtractor, threshold)


    # Add styling if this did not happen before.
    # extractStyleForSubtree mutates mainContent in place.
    if not extractStyle:
        extractStyleForSubtree(url, mainContent, driver)

    hierarchyTree = extractHierarchy(mainContent, contentExtractor)

    # Serialize the hierarchy tree into the JSON target structure.
    toReturn = generateTargetStructure(hierarchyTree, url, title)
    return toReturn
# Extract multiple T&Cs with the same driver and return result list.
# Attributes: links = list of links as string; contentExtractor = content extraction method; threshold = minimum
# coverage for main content node; driver = Selenium driver
def extractTandD_multiple(links, contentExtractor=src.ContentExtractor.ContentExtractorTypes.ContentExtractor.NaiveStyleAndShortTextExclusion,
                          threshold=0.85, driver=None):
    close = False
    if driver is None:
        close = True
        driver = webdriver.Chrome(executable_path='../chromedriver')
    try:
        # Reuse one driver across all links to avoid per-page browser startup.
        return [extractTandC(link, contentExtractor, threshold, driver)
                for link in links]
    finally:
        # FIX: previously an exception inside the loop leaked the browser
        # process. Only close a driver we created ourselves; a caller-supplied
        # driver stays open for further use.
        if close:
            driver.close()
# Parse hierarchy tree into sections.
# Attributes: tree = node of hierarchy tree
def parseToSections(tree):
    # Collect the node's own text fragments and recurse into sub-nodes in a
    # single pass over the children; relative order is preserved for both.
    fragments = []
    subsections = []
    for child in tree.children:
        if isinstance(child, Block):
            fragments.append(child.text)
        elif isinstance(child, BlockNode):
            subsections.append(parseToSections(child))

    # Nodes without a headline (e.g. synthetic group nodes) get an empty title.
    title = tree.headline.text if tree.headline is not None else ''
    return Section(title, '\n'.join(fragments), subsections)