v 1.0

sebischair · May 9, 2022 · 6ac9be0 · 6ac9be0
commit 6ac9be0
Show file tree

Hide file tree

Showing 24 changed files with 1,543 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,12 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+requests-html = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -0,0 +1,29 @@
+# LowestCommonAncestorExtractor
+
+A Python library for the structured extraction of content from German and English Terms and Conditions. Developed by [Tobias Schamel](https://wwwmatthes.in.tum.de/pages/665u6pdbc45i/Bachelor-s-Thesis-Tobias-Schamel) as part of the [AGB-Check](project) project.
+
+For citation, please use:
+```
+@InProceedings{schamel-EtAl:2022:ECNLP,
+  author    = {Schamel, Tobias and Braun, Daniel and Matthes, Florian},
+  title     = {Structured Extraction of Terms and Conditions from German and English Online Shops},
+  booktitle = {Proceedings of The Fifth Workshop on e-Commerce and NLP (ECNLP 5)},
+  month     = {May},
+  year      = {2022},
+  address   = {Dublin, Ireland},
+  publisher = {Association for Computational Linguistics}
+}
+
+```
+
+## License
+
+The software is provided under the MIT license.
+
+## Acknowledgements
+
+The project was supported by funds of the Federal Ministry for the Environment, Nature Conservation,
+Nuclear Safety and Consumer Protection (BMUV) based on a decision of the Parliament of the Federal
+Republic of Germany via the Federal Office for Agriculture and Food (BLE) under the innovation
+support programme.
+
diff --git a/src/ContentExtractor/ContentExtractor.py b/src/ContentExtractor/ContentExtractor.py
@@ -0,0 +1,17 @@
+from src.ContentExtractor.TreeUtilities import getFrequencyOfStyles, getLowestCommonAncestorNodeOfStyle
+
+# The content extractor enum lives in ContentExtractorTypes.py; the import below only provides cleanMainContent.
+from src.DOMParser.DOMParser import cleanMainContent
+
+
+# Extract the main content node using the method presented in the thesis.
+# Parameters: bodyDOMNode = body HTML node; contentExtractor = content extraction method; threshold = minimum coverage
+# for the main content node. Returns the main content node after cleanMainContent has been applied to it.
+def getMainContent(bodyDOMNode, contentExtractor, threshold):
+    dic = getFrequencyOfStyles(bodyDOMNode, contentExtractor)
+    max_key = max(dic, key=dic.get)  # key with the largest value in dic; max() raises ValueError if dic is empty
+    mainContent = getLowestCommonAncestorNodeOfStyle(bodyDOMNode, max_key, dic[max_key], threshold, contentExtractor)
+
+    # Clean the main content of empty nodes and other unwanted nodes (see cleanMainContent).
+    cleanMainContent(mainContent)
+    return mainContent
diff --git a/src/ContentExtractor/ContentExtractorTypes.py b/src/ContentExtractor/ContentExtractorTypes.py
@@ -0,0 +1,8 @@
+from enum import Enum
+
+# Enum of the content extractor types presented in Section 4.4 (presumably of the accompanying thesis).
+class ContentExtractor(Enum):
+    NaiveStyle = 1
+    RenderedStyle = 2
+    NaiveStyleAndShortTextExclusion = 3
+    RenderedStyleAndShortTextExclusion = 4