diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index 2d31b1c3..503eeb92 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.3'
+__version__ = '2.1.4'
diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py
index de9b12b1..cfa0fe72 100755
--- a/jobfunnel/__main__.py
+++ b/jobfunnel/__main__.py
@@ -25,6 +25,7 @@ def main():
         print(e.strerror)
         sys.exit()
 
+    # init class + logging
     jf = JobFunnel(config)
     jf.init_logging()
 
diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py
index d331dfb6..b28beb68 100644
--- a/jobfunnel/config/parser.py
+++ b/jobfunnel/config/parser.py
@@ -64,7 +64,7 @@ def parse_cli():
                         dest='domain',
                         type=str,
                         required=False,
-                        help='domain value for a region ')
+                        help='domain value for a region')
 
     parser.add_argument('-r',
                         dest='random',
@@ -143,6 +143,13 @@ def parse_cli():
                         required=False,
                         default=None,
                         help='save duplicates popped by tf_idf filter to file')
+    parser.add_argument('--max_listing_days',
+                        dest='max_listing_days',
+                        type=int,
+                        default=None,
+                        required=False,
+                        help='the maximum number of days old a job can be '
+                             '(e.g. pass 30 to filter out jobs older than a month)')
 
     return parser.parse_args()
 
@@ -178,6 +185,8 @@ def cli_to_yaml(cli):
 
     if cli.proxy is not None:
         yaml['proxy'] = split_url(cli.proxy)
+    if cli.max_listing_days is not None:
+        yaml['max_listing_days'] = cli.max_listing_days
 
     return yaml
 
@@ -290,5 +299,7 @@ def parse_config():
     # check if proxy has not been set yet (optional)
     if 'proxy' not in config:
         config['proxy'] = None
+    if 'max_listing_days' not in config:
+        config['max_listing_days'] = None
 
     return config
diff --git a/jobfunnel/config/valid_options.py b/jobfunnel/config/valid_options.py
index 2914f2d4..6697ba4c 100644
--- a/jobfunnel/config/valid_options.py
+++ b/jobfunnel/config/valid_options.py
@@ -25,13 +25,15 @@
         'converge': [bool]
     },
     'proxy': [
-        None,
+        None,
         {
             'protocol': str,
             'ip_address': str,
             'port': str
         }
-    ]
+    ],
+    'max_listing_days': [int]
+
 }
 
 PROVIDERS = ['glassdoor', 'indeed', 'monster']
diff --git a/jobfunnel/config/validate.py b/jobfunnel/config/validate.py
index 28d72836..5ded6007 100644
--- a/jobfunnel/config/validate.py
+++ b/jobfunnel/config/validate.py
@@ -68,3 +68,7 @@ def validate_config(config):
 
     # check validity of delay settings
     validate_delay(config['delay_config'])
+
+    # check validity of the max_listing_days setting
+    if config['max_listing_days'] is not None and config['max_listing_days'] < 0:
+        raise ConfigError('max_listing_days')
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 1803bd8d..99e278b5 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -282,14 +282,15 @@ def scrape(self):
             # key by id
             self.scrape_data[str(job['id'])] = job
 
-        # apply job pre-filter before scraping blurbs
-        super().pre_filter(self.scrape_data, self.provider)
+        # note: do not change the order of the next three statements;
+        # date_filter needs standardized dates before pre-filtering runs
 
         # stores references to jobs in list to be used in blurb retrieval
         scrape_list = [i for i in self.scrape_data.values()]
 
         # converts job date formats into a standard date format
         post_date_from_relative_post_age(scrape_list)
 
+        # apply job pre-filter before scraping blurbs
+        super().pre_filter(self.scrape_data, self.provider)
+
         # checks if delay is set or not, then extracts blurbs from job links
         if self.delay_config is not None:
diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py
index e2334b8d..d5f795fa 100755
--- a/jobfunnel/jobfunnel.py
+++ b/jobfunnel/jobfunnel.py
@@ -18,7 +18,7 @@ from requests import Session
 
 from .tools.delay import delay_alg
-from .tools.filters import tfidf_filter, id_filter
+from .tools.filters import tfidf_filter, id_filter, date_filter
 from .tools.tools import proxy_dict_to_url
 
 # setting job status to these words removes them from masterlist + adds to
@@ -39,6 +39,8 @@ class JobFunnel(object):
     filters """
 
     def __init__(self, args):
+        # the maximum number of days old a job can be
+        self.max_listing_days = args['max_listing_days']
         # paths
         self.master_list_path = args['master_list_path']
         self.filterlist_path = args['filter_list_path']
@@ -230,6 +232,9 @@ def update_filterjson(self):
 
     def pre_filter(self, data: Dict[str, dict], provider):
         """function called by child classes that applies multiple filters
           before getting job blurbs"""
+        # call date_filter if it is turned on
+        if self.max_listing_days is not None:
+            date_filter(data, self.max_listing_days)
         # call id_filter for master and duplicate lists, if they exist
         if os.path.isfile(self.master_list_path):
             id_filter(data, self.read_csv(self.master_list_path),
diff --git a/jobfunnel/monster.py b/jobfunnel/monster.py
index 17233f1e..603c8635 100644
--- a/jobfunnel/monster.py
+++ b/jobfunnel/monster.py
@@ -211,14 +211,14 @@ def scrape(self):
             # key by id
             self.scrape_data[str(job['id'])] = job
 
-        # apply job pre-filter before scraping blurbs
-        super().pre_filter(self.scrape_data, self.provider)
-
+        # note: do not change the order of the next three statements;
+        # date_filter needs standardized dates before pre-filtering runs
+
         # stores references to jobs in list to be used in blurb retrieval
         scrape_list = [i for i in self.scrape_data.values()]
 
         # converts job date formats into a standard date format
         post_date_from_relative_post_age(scrape_list)
 
+        # apply job pre-filter before scraping blurbs
+        super().pre_filter(self.scrape_data, self.provider)
+
         threads = ThreadPoolExecutor(max_workers=8)
         # checks if delay is set or not, then extracts blurbs from job links
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 77f5a3a9..8833c87f 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -1,12 +1,35 @@
 import nltk
 import logging
-
+from datetime import datetime, timedelta
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Dict, Optional
 
 from numpy import delete as np_delete, max as np_max, fill_diagonal
 
+
+def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
+    """Filter out jobs that are older than number_of_days.
+
+       The assumed date format is yyyy-mm-dd.
+
+        Args:
+            cur_dict: today's job scrape dict
+            number_of_days: how many days old a job can be
+    """
+    if number_of_days < 0 or cur_dict is None:
+        return
+    print("date_filter running")
+    cur_job_ids = [job['id'] for job in cur_dict.values()]
+    # calculate the oldest date a job can be
+    threshold_date = datetime.now() - timedelta(days=number_of_days)
+    for job_id in cur_job_ids:
+        # get the date from the job with this job_id
+        job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
+        # if this job is older than threshold_date, delete it from the current scrape
+        if job_date < threshold_date:
+            del cur_dict[job_id]
diff --git a/readme.md b/readme.md
--- a/readme.md
+++ b/readme.md
-  Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).
+Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).
+
+* **Filtering Old Jobs**
+  Filter out job listings that you consider too old:
+  `funnel -s JobFunnel/demo/settings.yaml --max_listing_days 30` will filter out job listings that are older than 30 days.
+
 * **Automating Searches**
   JobFunnel can be easily automated to run nightly with [crontab][cron]
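
For anyone who wants to try the new filter in isolation before applying this patch, below is a minimal, self-contained sketch of the logic that date_filter implements. The id-keyed jobs dict mirrors the scrape_data shape the scrapers build above; the sample listings themselves are invented for illustration.

    from datetime import datetime, timedelta
    from typing import Dict


    def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
        """Drop jobs whose 'date' field (yyyy-mm-dd) is older than number_of_days."""
        if number_of_days < 0 or cur_dict is None:
            return
        # oldest posting date a job may have and still be kept
        threshold_date = datetime.now() - timedelta(days=number_of_days)
        # iterate over a snapshot of ids so we can delete from the dict safely
        for job_id in [job['id'] for job in cur_dict.values()]:
            job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
            if job_date < threshold_date:
                del cur_dict[job_id]


    if __name__ == '__main__':
        # made-up sample scrape: one fresh listing, one stale one
        today = datetime.now()
        jobs = {
            '1': {'id': '1', 'date': (today - timedelta(days=5)).strftime('%Y-%m-%d')},
            '2': {'id': '2', 'date': (today - timedelta(days=45)).strftime('%Y-%m-%d')},
        }
        date_filter(jobs, 30)
        print(sorted(jobs))  # ['1'] -- the 45-day-old listing is filtered out

Note the snapshot of job ids taken before the loop: mutating a dict while iterating its values directly would raise a RuntimeError, which is why the patch (and this sketch) collects the ids first.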