35 changes: 35 additions & 0 deletions .github/workflows/ci-build.yml
@@ -0,0 +1,35 @@
name: Build unstable

on: [push]

concurrency:
  group: unstable
  # cancel-in-progress: true


jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Cleanup more disk space
        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade flake8 pytest pycodestyle
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # - name: Test with pytest
      #   run: |
      #     python -m pytest --rootdir .
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.idea
.env
45 changes: 42 additions & 3 deletions README.md
@@ -3,26 +3,41 @@ A python tool for downloading & processing the [stackexchange data dumps](https:

Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

# Setup
## Setup
```
git clone https://github.com/EleutherAI/stackexchange_dataset/
cd stackexchange_dataset
pip install -r requirements.txt
```
# Usage
## Usage

To download *every* stackexchange dump & parse to text, simply run

### List all available StackExchange dumps

```
python3 main.py --list
```



### Download every StackExchange dump

To download *every* StackExchange dump & parse it to text, simply run

```
python3 main.py --names all
```

### Download a single StackExchange dump

To download only a single StackExchange dump, you can pass its name as an argument, e.g.:

```
python3 main.py --names security.stackexchange
```

### Download a list of StackExchange dumps

To download multiple StackExchange dumps, pass the names separated by commas, e.g.:

```
@@ -31,6 +46,17 @@ python3 main.py --names ru.stackoverflow,money.stackexchange

The name should be the URL of the StackExchange site, minus `http(s)://` and `.com`. You can view all available StackExchange dumps [here](https://archive.org/download/stackexchange).
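
As a rough illustration of that naming convention, the sketch below maps a `--names` value to its archive.org dump (the real logic lives in `downloader.py`; the exact file-naming scheme here is an assumption, and Stack Overflow itself is split into several archives):

```python
# Illustration only: rough mapping from a --names value to the archive.org dump URL.
# The real mapping is implemented in downloader.py; the naming scheme below is an assumption.
def guess_dump_url(name: str) -> str:
    if name == "stackoverflow":
        # Stack Overflow is split into several archives; Posts is the one this tool processes.
        return "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
    return "https://archive.org/download/stackexchange/{}.com.7z".format(name)

print(guess_dump_url("security.stackexchange"))
# -> https://archive.org/download/stackexchange/security.stackexchange.com.7z
```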

## List available sources in Stack Exchange

This will list all the available sources:

```
python3 main.py --list
```

They are printed as a plain list, which can be parsed with `grep` and other batch utilities.
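
If you would rather filter the list in Python instead of `grep`, here is a minimal sketch using the same `Stack_Exchange_Downloader` class that `main.py` uses for `--list` (assumed to be run from the repository root):

```python
# Minimal sketch: build and filter the source list programmatically,
# mirroring what main.py does for --list. Assumes it is run from the repo root.
from downloader import Stack_Exchange_Downloader

s = Stack_Exchange_Downloader("all")
security_sites = [name for name in sorted(s.sites.keys()) if "security" in name]
print("\n".join(security_sites))
```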


## All Usage Options:

```
@@ -47,6 +73,19 @@ optional arguments:
*every* stackoverflow site
```

### Proxy support

If you need to go through a proxy, you can configure a `.env` file and add the following:

```
HTTP_PROXY=http://proxy:port
http_proxy=http://proxy:port
HTTPS_PROXY=http://proxy:port
https_proxy=http://proxy:port
NO_PROXY=address to ignore,localhost
no_proxy=address to ignore,localhost
```
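
A rough sketch of how those settings take effect (assuming `python-dotenv`, which `main.py` loads, and `requests` as the HTTP client; the downloader's actual client may differ):

```python
# Sketch: how the .env proxy settings are picked up. Assumes python-dotenv and requests;
# the downloader's actual HTTP client may differ.
import os

import dotenv
import requests

# main.py calls dotenv.load_dotenv(override=True), copying HTTP(S)_PROXY / NO_PROXY
# from .env into the process environment.
dotenv.load_dotenv(override=True)

# requests (and urllib) honor those environment variables automatically.
print("HTTPS proxy in use:", os.environ.get("HTTPS_PROXY"))
resp = requests.get("https://archive.org/download/stackexchange")
print(resp.status_code)
```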

# TODO:

- [ ] should we add metadata to the text (i.e. name of the stackexchange & tags)?
110 changes: 78 additions & 32 deletions main.py
@@ -1,56 +1,82 @@
import argparse, traceback
import argparse
import os
import traceback
from itertools import repeat
from multiprocessing import Pool, cpu_count
from utils import *

import dotenv
from lm_dataformat import Archive, JSONArchive, TextArchive, LM_DATAFORMAT_FORMAT, TEXT_FORMAT, SUPPORTED_FORMATS, \
JSON_FORMAT

from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer
import os
from itertools import repeat
from lm_dataformat import Archive
import zipfile

dotenv.load_dotenv(override=True)

def download_and_process_single(name, out_format, min_score, max_responses):

def download_and_process_single(name, out_format, min_score, max_responses, keep_sources=False):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
s = Stack_Exchange_Downloader(name)
if name not in s.sites:
similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
print("StackExchange source not found. Perhaps you meant", similar_entries)
return

path_to_xml = "dumps/{}/Posts.xml".format(name)
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
else:
path_to_7z = "dumps/stackoverflow.com-Posts.7z"
out_folder = "out".format(name)

out_folder = "out/{}".format(name)
os.makedirs(out_folder, exist_ok=True)
if not os.path.isfile(path_to_7z):
# download 7z if it's not downloaded already
s.download()

if not os.path.isfile(path_to_xml):
# extract 7z if it's not extracted already
s.extract()
if out_format == "lm_dataformat":

if out_format == LM_DATAFORMAT_FORMAT:
archiver = Archive(out_folder)
elif out_format == "zip":
archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
elif out_format == TEXT_FORMAT:
archiver = TextArchive(out_folder)
elif out_format == JSON_FORMAT:
archiver = JSONArchive(out_folder)
else:
archiver = None
qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
qa.main()
if out_format == "lm_dataformat":
archiver.commit(name)
elif out_format == "zip":
archiver.close()
try:
os.remove(path_to_7z)
except FileNotFoundError:
print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]

qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score,
max_responses=max_responses)
qa.process()
archiver.commit(name)

if not keep_sources:
try:
os.remove(path_to_7z)
except FileNotFoundError:
print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))

directory_uncompressed = "dumps/{}".format(name)
filelist = [f for f in os.listdir(directory_uncompressed)
if f.endswith(".xml")]
for f in filelist:
os.remove(os.path.join("dumps/{}".format(name), f))
os.remove(os.path.join(directory_uncompressed, f))
os.removedirs(directory_uncompressed)
except:
traceback.print_exc()


def main(args):
if args.list:
s = Stack_Exchange_Downloader("all")
print("List of all the sources of StackExchange: ")
print("- " + "\n- ".join(sorted(s.sites.keys())))
return

names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
@@ -60,31 +86,51 @@ def main(args):
# bring stackoverflow to the front so it is always processed first, since it's the largest
if "stackoverflow" in names:
names.insert(0, names.pop(names.index("stackoverflow")))
# if args.no_zip:
# print("Downloading everything required the output to be compressed. Re-run *without* the option --no-zip.")
# sys.exit(-1)
print('Downloading and processing stackexchange dumps for {}'.format(names))
# Download & Process
# init pool with as many CPUs as available
cpu_no = cpu_count() - 1
p = Pool(cpu_no)
p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
p.starmap(download_and_process_single,
zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses),
repeat(args.keep_sources)))


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
'question-answer pair text dataset for Language Models')

parser.add_argument('--list', help='list of all the sources from stackechange',
required=False, action="store_true")

parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
'If "all", will download, extract & parse *every* stackoverflow site',
default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
type=str)
parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
'lm_dataformat, as you will run into number of files per directory limits.',
default="zip",
parser.add_argument('--out_format',
help='format of out file - if you are processing everything this will need to be '
'lm_dataformat, as you will run into number of files per directory limits.',
default=TEXT_FORMAT,
choices=SUPPORTED_FORMATS,
type=str)
parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
# parser.add_argument('--no-zip',
# help="Disable the compression of the output files. Writing plain files might end up in problems with the filesystem",
# action="store_true",
# required=False,
# default=False)
parser.add_argument('--min_score',
help='minimum score of a response in order to be included in the dataset. Default 3.',
type=int, default=3)
parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
parser.add_argument('--max_responses',
help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
parser.add_argument('--keep-sources',
help='Do not clean-up the downloaded source 7z files.',
action="store_true", default=False)
args = parser.parse_args()
main(args)


main(args)