35 changes: 35 additions & 0 deletions .github/workflows/ci-build.yml
@@ -0,0 +1,35 @@
name: Build unstable

on: [push]

concurrency:
  group: unstable
  # cancel-in-progress: true


jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - name: Cleanup more disk space
        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade flake8 pytest pycodestyle
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # - name: Test with pytest
      #   run: |
      #     python -m pytest --rootdir .
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.idea
.env
45 changes: 42 additions & 3 deletions README.md
@@ -3,26 +3,41 @@ A python tool for downloading & processing the [stackexchange data dumps](https:

Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

# Setup
## Setup
```
git clone https://github.com/EleutherAI/stackexchange_dataset/
cd stackexchange_dataset
pip install -r requirements.txt
```
# Usage
## Usage

To download *every* stackexchange dump & parse to text, simply run

### List all available StackExchange dumps

```
python3 main.py --list
```



### Download every StackExchange dump

To download *every* StackExchange dump & parse it to text, simply run

```
python3 main.py --names all
```

### Download a single StackExchange dump

To download only a single StackExchange dump, you can pass its name as an argument, e.g.:

```
python3 main.py --names security.stackexchange
```

### Download a list of StackExchange dumps

To download multiple StackExchange dumps, pass the names separated by commas, e.g.:

```
@@ -31,6 +46,17 @@ python3 main.py --names ru.stackoverflow,money.stackexchange

The name should be the URL of the StackExchange site, minus `http(s)://` and `.com`. You can view all available StackExchange dumps [here](https://archive.org/download/stackexchange).
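
As a rough illustration of that naming convention, the sketch below maps a `--names` value to its archive.org dump (the real logic lives in `downloader.py`; the exact file-naming scheme here is an assumption, and Stack Overflow itself is split into several archives):

```python
# Illustration only: rough mapping from a --names value to the archive.org dump URL.
# The real mapping is implemented in downloader.py; the naming scheme below is an assumption.
def guess_dump_url(name: str) -> str:
    if name == "stackoverflow":
        # Stack Overflow is split into several archives; Posts is the one this tool processes.
        return "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
    return "https://archive.org/download/stackexchange/{}.com.7z".format(name)

print(guess_dump_url("security.stackexchange"))
# -> https://archive.org/download/stackexchange/security.stackexchange.com.7z
```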

## List available sources in Stack Exchange

This will list all the available sources:

```
python3 main.py --list
```

They are printed as a plain list, which can be parsed with `grep` and other batch utilities.
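
If you would rather filter the list in Python instead of `grep`, here is a minimal sketch using the same `Stack_Exchange_Downloader` class that `main.py` uses for `--list` (assumed to be run from the repository root):

```python
# Minimal sketch: build and filter the source list programmatically,
# mirroring what main.py does for --list. Assumes it is run from the repo root.
from downloader import Stack_Exchange_Downloader

s = Stack_Exchange_Downloader("all")
security_sites = [name for name in sorted(s.sites.keys()) if "security" in name]
print("\n".join(security_sites))
```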


## All Usage Options:

```
@@ -47,6 +73,19 @@ optional arguments:
*every* stackoverflow site
```

### Proxy support

If you need to go through a proxy, you can configure a `.env` file and add the following:

```
HTTP_PROXY=http://proxy:port
http_proxy=http://proxy:port
HTTPS_PROXY=http://proxy:port
https_proxy=http://proxy:port
NO_PROXY=address to ignore,localhost
no_proxy=address to ignore,localhost
```
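
A rough sketch of how those settings take effect (assuming `python-dotenv`, which `main.py` loads, and `requests` as the HTTP client; the downloader's actual client may differ):

```python
# Sketch: how the .env proxy settings are picked up. Assumes python-dotenv and requests;
# the downloader's actual HTTP client may differ.
import os

import dotenv
import requests

# main.py calls dotenv.load_dotenv(override=True), copying HTTP(S)_PROXY / NO_PROXY
# from .env into the process environment.
dotenv.load_dotenv(override=True)

# requests (and urllib) honor those environment variables automatically.
print("HTTPS proxy in use:", os.environ.get("HTTPS_PROXY"))
resp = requests.get("https://archive.org/download/stackexchange")
print(resp.status_code)
```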

# TODO:

- [ ] should we add metadata to the text (i.e. name of the stackexchange & tags)?
110 changes: 78 additions & 32 deletions main.py
@@ -1,56 +1,82 @@
import argparse, traceback
import argparse
import os
import traceback
from itertools import repeat
from multiprocessing import Pool, cpu_count
from utils import *

import dotenv
from lm_dataformat import Archive, JSONArchive, TextArchive, LM_DATAFORMAT_FORMAT, TEXT_FORMAT, SUPPORTED_FORMATS, \
JSON_FORMAT

from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer
import os
from itertools import repeat
from lm_dataformat import Archive
import zipfile

dotenv.load_dotenv(override=True)

def download_and_process_single(name, out_format, min_score, max_responses):

def download_and_process_single(name, out_format, min_score, max_responses, keep_sources=False):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
s = Stack_Exchange_Downloader(name)
if name not in s.sites:
similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
print("StackExchange source not found. Perhaps you meant", similar_entries)
return

path_to_xml = "dumps/{}/Posts.xml".format(name)
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
else:
path_to_7z = "dumps/stackoverflow.com-Posts.7z"
out_folder = "out".format(name)

out_folder = "out/{}".format(name)
os.makedirs(out_folder, exist_ok=True)
if not os.path.isfile(path_to_7z):
# download 7z if it's not downloaded already
s.download()

if not os.path.isfile(path_to_xml):
# extract 7z if it's not extracted already
s.extract()
if out_format == "lm_dataformat":

if out_format == LM_DATAFORMAT_FORMAT:
archiver = Archive(out_folder)
elif out_format == "zip":
archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
elif out_format == TEXT_FORMAT:
archiver = TextArchive(out_folder)
elif out_format == JSON_FORMAT:
archiver = JSONArchive(out_folder)
else:
archiver = None
qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
qa.main()
if out_format == "lm_dataformat":
archiver.commit(name)
elif out_format == "zip":
archiver.close()
try:
os.remove(path_to_7z)
except FileNotFoundError:
print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]

qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score,
max_responses=max_responses)
qa.process()
archiver.commit(name)

if not keep_sources:
try:
os.remove(path_to_7z)
except FileNotFoundError:
print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))

directory_uncompressed = "dumps/{}".format(name)
filelist = [f for f in os.listdir(directory_uncompressed)
if f.endswith(".xml")]
for f in filelist:
os.remove(os.path.join("dumps/{}".format(name), f))
os.remove(os.path.join(directory_uncompressed, f))
os.removedirs(directory_uncompressed)
except:
traceback.print_exc()


def main(args):
if args.list:
s = Stack_Exchange_Downloader("all")
print("List of all the sources of StackExchange: ")
print("- " + "\n- ".join(sorted(s.sites.keys())))
return

names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
@@ -60,31 +86,51 @@ def main(args):
# bring stackoverflow to the front so it is always processed first, since it's the largest
if "stackoverflow" in names:
names.insert(0, names.pop(names.index("stackoverflow")))
# if args.no_zip:
# print("Downloading everything required the output to be compressed. Re-run *without* the option --no-zip.")
# sys.exit(-1)
print('Downloading and processing stackexchange dumps for {}'.format(names))
# Download & Process
# init pool with as many CPUs as available
cpu_no = cpu_count() - 1
p = Pool(cpu_no)
p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
p.starmap(download_and_process_single,
zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses),
repeat(args.keep_sources)))


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
'question-answer pair text dataset for Language Models')

parser.add_argument('--list', help='list of all the sources from stackechange',
required=False, action="store_true")

parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
'If "all", will download, extract & parse *every* stackoverflow site',
default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
type=str)
parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
'lm_dataformat, as you will run into number of files per directory limits.',
default="zip",
parser.add_argument('--out_format',
help='format of out file - if you are processing everything this will need to be '
'lm_dataformat, as you will run into number of files per directory limits.',
default=TEXT_FORMAT,
choices=SUPPORTED_FORMATS,
type=str)
parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
# parser.add_argument('--no-zip',
# help="Disable the compression of the output files. Writing plain files might end up in problems with the filesystem",
# action="store_true",
# required=False,
# default=False)
parser.add_argument('--min_score',
help='minimum score of a response in order to be included in the dataset. Default 3.',
type=int, default=3)
parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
parser.add_argument('--max_responses',
help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
parser.add_argument('--keep-sources',
help='Do not clean-up the downloaded source 7z files.',
action="store_true", default=False)
args = parser.parse_args()
main(args)


main(args)