Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Intake-ifying osm #20

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,6 @@ before_install:
script:
- doit small_data_setup --name $DIR
- cd $DIR
- if ! anaconda-project list-downloads | grep -q 'No downloads'; then
if ! [ -d data ]; then
echo 'FAIL needs data and no test data found' && exit 1;
fi;
fi
- $RUN_LINT && anaconda-project run lint || echo "skipping lint"
- $RUN_TEST && anaconda-project run test || echo "skipping test"
- cd ..
Expand Down
108 changes: 78 additions & 30 deletions dodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def task_ecosystem_setup():
return {'actions': [
"conda config --set always_yes True",
"conda update conda",
"conda install anaconda-project 'tornado<5.0'",
"conda install anaconda-project 'tornado<5.0' pyyaml",
]}


Expand All @@ -25,16 +25,28 @@ def task_ecosystem_setup():
'default': 'attractors'
}

def _prepare_paths(root, name, test_data):
def _prepare_paths(root, name, test_data, filename='catalog.yml'):
if root == '':
root = os.getcwd()
root = os.path.abspath(root)
test_data = test_data if os.path.isabs(test_data) else os.path.join(root, test_data)
test_path = os.path.join(test_data, name)
project_path = os.path.join(root, name)
return {
'real': os.path.join(root, name, 'data'),
'test': os.path.join(test_data, name),
'project': project_path,
'real': os.path.join(project_path, 'data'),
'test': test_path,
'cat_real': os.path.join(project_path, filename),
'cat_test': os.path.join(test_path, filename),
'cat_tmp': os.path.join(project_path, 'tmp_' + filename),
}

def _has_download(path, filename='anaconda-project.yml'):
    """Return True if the project file ``path/filename`` lists any downloads.

    Parameters
    ----------
    path : str
        Directory that contains the project file.
    filename : str
        Name of the YAML project file inside ``path``.

    Returns
    -------
    bool
        True when the top-level ``downloads`` entry is present and non-empty.
        False when it is missing, empty/null, or when the file does not
        contain a top-level mapping (e.g. an empty file).

    Raises
    ------
    OSError
        If the project file cannot be opened.
    yaml.YAMLError
        If the project file is not valid YAML.
    """
    # Function-scope import: yaml is only imported when this helper is called.
    import yaml

    with open(os.path.join(path, filename), 'r') as f:
        # safe_load constructs only plain YAML types and does not depend on
        # the FullLoader class.
        spec = yaml.safe_load(f)

    # An empty document parses to None (and a scalar/list document is not a
    # mapping either); calling .get() on those would raise AttributeError.
    if not isinstance(spec, dict):
        return False

    # A missing key and an explicit `downloads: {}` / `downloads:` (null) are
    # all falsy, so no default is needed here.
    return bool(spec.get('downloads'))

# From https://stackoverflow.com/a/24860799/4021797
class dircmp(filecmp.dircmp):
"""
Expand Down Expand Up @@ -64,26 +76,51 @@ def is_same(dir1, dir2):
return False
return True


def task_small_data_setup():
"""Copy small versions of the data from test_data"""

def copy_test_data(root='', name='attractors', test_data='test_data'):
print('Copying test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def copy_test_data(root='', name='attractors', test_data='test_data', cat_filename='catalog.yml'):
print('Setting up test data for {}:'.format(name))

if not os.path.exists(paths['test']):
print("Warning: No test_data found for {} in {}".format(name, paths['test']))
return
paths = _prepare_paths(root, name, test_data, cat_filename)
has_catalog = os.path.exists(paths['cat_real'])
has_download = _has_download(paths['project'])

if not os.path.exists(paths['test']) or not os.listdir(paths['test']):
if has_catalog or has_download:
raise ValueError("Fail: {} has no test_data".format(name))
else:
print(" Nothing to do: Test data not needed for {}".format(name))
print("Done!")
return

if has_catalog and not os.path.exists(paths['cat_test']):
raise ValueError("Fail: {} contains intake catalog, but "
"no catalog found in test_data".format(name))

if has_catalog:
print('* Copying intake catalog ...')

# move real catalog file to tmp if tmp doesn't exist
if os.path.exists(paths['cat_tmp']):
raise ValueError("Fail: Temp file already exists - try 'doit small_data_cleanup'")
os.rename(paths['cat_real'], paths['cat_tmp'])

# move test catalog to project directory
shutil.copyfile(paths['cat_test'], paths['cat_real'])
print(' Intake catalog successfully copied')

print('* Copying test data ...')
if os.path.exists(paths['real']) and os.listdir(paths['real']):
matching_files = filecmp.dircmp(paths['test'], paths['real']).same_files
if os.listdir(paths['real']) != matching_files:
raise ValueError("Fail: Data files already exist in {}".format(paths['real']))
else:
print("Nothing to do: Test data already in {}".format(paths['real']))
return
copy_tree(paths['test'], paths['real'])
print(" Nothing to do: Test data already in {}".format(paths['real']))
else:
copy_tree(paths['test'], paths['real'])
print(' Test data sucessfully copied')

print("Done!")

return {'actions': [copy_test_data], 'params': [name_param]}
Expand All @@ -92,27 +129,38 @@ def copy_test_data(root='', name='attractors', test_data='test_data'):
def task_small_data_cleanup():
"""Remove test_data from real data path"""

def remove_test_data(root='', name='attractors', test_data='test_data'):
print('Removing test data for {} ...'.format(name))
paths = _prepare_paths(root, name, test_data)
def remove_test_data(root='', name='attractors', test_data='test_data',
cat_filename='catalog.yml'):
print('Cleaning up test data for {}:'.format(name))
paths = _prepare_paths(root, name, test_data, cat_filename)

if not os.path.exists(paths['test']):
print("Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
return
if os.path.exists(paths['cat_real']):
print("* Replacing intake catalog ...")

if not os.path.exists(paths['real']):
print("Nothing to do: No data found in {}".format(paths['real']))
return
if not os.path.exists(paths['cat_tmp']):
print(" Nothing to do: No temp file found. Use git status to "
"check that you have the real catalog at {}".format(paths['cat_real']))
else:
os.remove(paths['cat_real'])
os.rename(paths['cat_tmp'], paths['cat_real'])
print(' Intake catalog successfully cleaned')

if not os.listdir(paths['real']):
print("No data found in {}, just removing empty dir".format(paths['real']))
os.rmdir(paths['real'])
return
print('* Removing test data ...')

if not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, so they shouldn't be deleted.".format(paths['real']))
if not os.path.exists(paths['test']):
print(" Nothing to do: No test_data found for {} in {}".format(name, paths['test']))
elif not os.path.exists(paths['real']):
print(" Nothing to do: No data found in {}".format(paths['real']))
elif not os.listdir(paths['real']):
os.rmdir(paths['real'])
print(" No data found in {}, just removed empty dir".format(paths['real']))
elif not is_same(paths['test'], paths['real']):
raise ValueError("Fail: Data files at {} are not identical to test, "
"so they shouldn't be deleted.".format(paths['real']))
else:
shutil.rmtree(paths['real'])
print(' Test data successfully removed')

shutil.rmtree(paths['real'])
print("Done!")

return {'actions': [remove_test_data], 'params': [name_param]}
44 changes: 36 additions & 8 deletions osm/anaconda-project.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# To reproduce: install 'anaconda-project', then 'anaconda-project run'
name: osm
description: Datashading 2.7-billion-point Open Street Map database
description: Datashading Open Street Map database
maintainers:
- jbednar

Expand All @@ -22,22 +22,50 @@ packages: *deps

commands:
notebook:
description: Datashading 1-billion-point Open Street Map database
notebook: osm-1billion.ipynb
env_spec: spatial
osm-3billion:
description: Datashading 2.7-billion-point Open Street Map database
notebook: osm.ipynb
test:
unix: pytest --nbsmoke-run -k *.ipynb --ignore envs
windows: pytest --nbsmoke-run -k *.ipynb --ignore envs
unix: pytest --nbsmoke-run --ignore envs
windows: pytest --nbsmoke-run --ignore envs
env_spec: test
lint:
unix: pytest --nbsmoke-lint -k *.ipynb --ignore envs
windows: pytest --nbsmoke-lint -k *.ipynb --ignore envs
unix: pytest --nbsmoke-lint --ignore envs
windows: pytest --nbsmoke-lint --ignore envs
env_spec: test

variables: {}
variables:
INTAKE_CACHE_DIR: data
downloads: {}

env_specs:
default: {}
spatial:
channels:
- pyviz
- intake
packages:
- holoviews
- intake
- ipywidgets
- jupyter
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
jbednar marked this conversation as resolved.
Show resolved Hide resolved
test:
channels:
- pyviz
- intake
packages:
- nbsmoke ==0.2.8
- pytest ==4.4.1
- nbsmoke ==0.2.8
- pytest ==4.4.1
- holoviews
- intake
- ipywidgets
- jupyter
- tqdm
- pip:
- git+https://github.com/intake/intake-parquet.git@jsignell/spatial#egg=intake_parquet
10 changes: 10 additions & 0 deletions osm/catalog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
sources:
osm_one_billion:
description: Open Street Map database as a parquet file (1-billion-point)
driver: parquet
cache:
- argkey: urlpath
regex: https://s3.amazonaws.com/datashader-data
type: compressed
args:
urlpath: https://s3.amazonaws.com/datashader-data/osm-1billion.parq.zip
23 changes: 20 additions & 3 deletions osm-1billion.ipynb → osm/osm-1billion.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
"metadata": {},
"outputs": [],
"source": [
"import intake\n",
"import datashader as ds\n",
"import datashader.transfer_functions as tf\n",
"from datashader import spatial\n",
"\n",
"from colorcet import fire"
]
Expand All @@ -39,8 +39,25 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"df = spatial.read_parquet('../data/osm-1billion.parq')\n",
"cat = intake.open_catalog('./catalog.yml')\n",
"data_entry = cat.osm_one_billion\n",
"data_entry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the first time the following cell is executed, the file will take some time to download; subsequent runs reuse the cached copy and skip that step:"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you know at this point whether it's been downloaded yet? Won't it be cached? Seems like it should say instead that "Note that the first time this cell is executed, the file will take some time to download, but subsequent runs will skip that step".

]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = data_entry.to_spatial()\n",
"df = df.persist()"
]
},
Expand Down
6 changes: 6 additions & 0 deletions test_data/osm/catalog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sources:
osm_one_billion:
description: Test stand-in for the 1-billion-point data; reads the fake osm-3billion.parq file from the local data directory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as what? Try to reword in a way that makes sense when reading just this one file.

driver: parquet
args:
urlpath: '{{ CATALOG_DIR }}/data/osm-3billion.parq'