Skip to content

Commit

Permalink
Cleanse GCI data
Browse files Browse the repository at this point in the history
Before the Google Code-in data can be stored in the repository,
the data of students who are just starting needs to be removed,
the status types need to be simplified to reduce side channels,
the unpublished tasks need to be removed, and the mentor list of
each task needs to be removed.

Related to coala#3
  • Loading branch information
jayvdb committed Dec 16, 2017
1 parent b0d2d0e commit c06915d
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 11 deletions.
1 change: 1 addition & 0 deletions .ci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ bash orgname.sh
mkdir private _site public

python manage.py fetch_gci_task_data private
python manage.py cleanse_gci_task_data private public

python activity/scraper.py || true

Expand Down
2 changes: 1 addition & 1 deletion gci/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

GCI_DATA_DIR = os.path.join(
os.path.dirname(__file__), '..',
'private'
'public'
)


Expand Down
43 changes: 43 additions & 0 deletions gci/management/commands/cleanse_gci_task_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from collections import OrderedDict
import os.path

from ruamel.yaml import YAML

from django.core.management.base import BaseCommand

from gci.students import (
_get_instances,
_get_tasks,
cleanse_instances,
cleanse_tasks,
)


class Command(BaseCommand):
    """Management command that writes a cleansed copy of the GCI data.

    ``tasks.yaml`` and ``instances.yaml`` are read from ``input_dir``,
    passed through ``cleanse_tasks`` and ``cleanse_instances``, and the
    results are written under the same file names into ``output_dir``.
    """

    args = ''
    help = 'Cleanse GCI data'

    def add_arguments(self, parser):
        """Register the ``input_dir`` and ``output_dir`` positionals."""
        parser.add_argument('input_dir', nargs='?', type=str)
        parser.add_argument('output_dir', nargs='?', type=str)

    def handle(self, *args, **options):
        """Load, cleanse and re-dump the task and instance YAML files.

        :param options: Parsed command options; ``input_dir`` and
                        ``output_dir`` are read from it.
        :raises CommandError: If either directory was not supplied.
        """
        # Imported locally so this block does not depend on a change to
        # the module's import list.
        from django.core.management.base import CommandError

        input_dir = options.get('input_dir')
        output_dir = options.get('output_dir')

        # Both positionals are declared with nargs='?', so argparse lets
        # them be omitted; without this check os.path.join() would fail
        # later with an opaque TypeError on None.
        if not input_dir or not output_dir:
            raise CommandError(
                'Both input_dir and output_dir must be specified')

        yaml = YAML()

        with open(os.path.join(input_dir, 'tasks.yaml'), 'r') as f:
            tasks = yaml.load(f)

        with open(os.path.join(input_dir, 'instances.yaml'), 'r') as f:
            instances = yaml.load(f)

        tasks = cleanse_tasks(tasks)
        instances = cleanse_instances(instances)

        # Nothing is written until both inputs have been read and
        # cleansed, so a failure above leaves output_dir untouched.
        with open(os.path.join(output_dir, 'tasks.yaml'), 'w') as f:
            yaml.dump(tasks, f)

        with open(os.path.join(output_dir, 'instances.yaml'), 'w') as f:
            yaml.dump(instances, f)
56 changes: 46 additions & 10 deletions gci/students.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@
from .gitorg import get_issue


# Instance statuses that must not be published: cleanse_instances() drops
# every instance whose 'status' is one of these values.
PRIVATE_INSTANCE_STATUSES = (
'ABANDONED',
'OUT_OF_TIME',
'PENDING_PARENTAL_CONSENT',
'UNASSIGNED_BY_MENTOR',
)

# Keys that cleanse_instances() deletes from every instance it keeps.
PRIVATE_INSTANCE_ATTRIBUTES = (
'modified',
'deadline',
)

# Module-level state. NOTE(review): presumably lazily populated caches used
# by the get_* helpers in this module -- confirm against the rest of the file.
_client = None
_org = {}
_tasks = {}
Expand Down Expand Up @@ -78,6 +90,37 @@ def get_instances():
return _instances


def cleanse_tasks(tasks):
    """Return the publishable tasks with their mentor lists removed.

    Only tasks whose ``'status'`` equals ``2`` are kept, and the
    ``'mentors'`` key is removed from each kept task.

    The kept task mappings are the same objects as in ``tasks`` and are
    modified in place; only the outer mapping is new.

    :param tasks: Mapping of task id to task mapping. Every task must
                  have a ``'status'`` key.
    :return: A new dict containing only the kept tasks.
    """
    # NOTE(review): status 2 is presumably "published" (the commit
    # description says unpublished tasks are removed) -- confirm against
    # the GCI API.
    cleansed_tasks = {
        task_id: task
        for task_id, task in tasks.items()
        if task['status'] == 2
    }

    for task in cleansed_tasks.values():
        # pop() with a default instead of del: a task that has no mentor
        # list is already clean and must not abort the whole run.
        task.pop('mentors', None)

    return cleansed_tasks


def cleanse_instances(instances):
    """Return the publishable instances with private details removed.

    Instances whose ``'status'`` is listed in
    ``PRIVATE_INSTANCE_STATUSES`` are dropped. For every remaining
    instance, any status other than ``'COMPLETED'`` is collapsed to
    ``'CLAIMED'`` and the keys listed in ``PRIVATE_INSTANCE_ATTRIBUTES``
    are removed.

    The kept instance mappings are the same objects as in ``instances``
    and are modified in place; only the outer mapping is new.

    :param instances: Mapping of instance id to instance mapping. Every
                      instance must have a ``'status'`` key.
    :return: A new dict containing only the kept instances.
    """
    cleansed_instances = {
        instance_id: instance
        for instance_id, instance in instances.items()
        if instance['status'] not in PRIVATE_INSTANCE_STATUSES
    }

    for instance in cleansed_instances.values():
        # Only two statuses are published: completed or not.
        if instance['status'] != 'COMPLETED':
            instance['status'] = 'CLAIMED'
        for key in PRIVATE_INSTANCE_ATTRIBUTES:
            # pop() with a default instead of del: an instance that lacks
            # the attribute is already clean and must not raise KeyError.
            instance.pop(key, None)

    return cleansed_instances


def get_students():
students = {}
for _, instance in get_instances().items():
Expand All @@ -99,17 +142,8 @@ def get_students():
student['instances'].append(instance)


def get_effective_students(students):
    """Yield the students that have at least one non-abandoned instance.

    :param students: Iterable of student mappings, each with an
                     ``'instances'`` list of mappings that have a
                     ``'status'`` key.
    :return: Generator over the students, in their original order,
             that have one or more instances whose status is not
             ``'ABANDONED'``.
    """
    # Take a snapshot first so iteration does not depend on the caller's
    # iterable while the generator is being consumed.
    snapshot = list(students)
    for candidate in snapshot:
        remaining = [
            entry
            for entry in candidate['instances']
            if entry['status'] != 'ABANDONED'
        ]
        if not remaining:
            continue
        yield candidate


def get_issue_related_students(students):
for student in list(get_effective_students(students)):
for student in list(get_students(students)):
instances = student['instances']
for instance in instances:
task = get_task(instance['task_definition_id'])
Expand Down Expand Up @@ -142,5 +176,7 @@ def get_linked_students(students):
(task_id, url, ', '.join(issue.assignees)))
else:
student['username'] = issue.assignees[0]
print('student %s is %s because of %s' %
(student['id'], issue.assignees[0], url))
yield student
break

0 comments on commit c06915d

Please sign in to comment.