diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index 2d31b1c3..503eeb92 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.3'
+__version__ = '2.1.4'
diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py
index de9b12b1..cfa0fe72 100755
--- a/jobfunnel/__main__.py
+++ b/jobfunnel/__main__.py
@@ -25,6 +25,7 @@ def main():
print(e.strerror)
sys.exit()
+
# init class + logging
jf = JobFunnel(config)
jf.init_logging()
diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py
index d331dfb6..b28beb68 100644
--- a/jobfunnel/config/parser.py
+++ b/jobfunnel/config/parser.py
@@ -64,7 +64,7 @@ def parse_cli():
dest='domain',
type=str,
required=False,
- help='domain value for a region ')
+                        help='domain value for a region')
parser.add_argument('-r',
dest='random',
@@ -143,6 +143,13 @@ def parse_cli():
required=False,
default=None,
help='save duplicates popped by tf_idf filter to file')
+ parser.add_argument('--max_listing_days',
+ dest='max_listing_days',
+ type=int,
+ default=None,
+ required=False,
+                        help='The maximum number of days old a job can be '
+                             '(i.e. pass 30 to filter out jobs older than a month)')
return parser.parse_args()
@@ -178,6 +185,8 @@ def cli_to_yaml(cli):
if cli.proxy is not None:
yaml['proxy'] = split_url(cli.proxy)
+ if cli.max_listing_days is not None:
+ yaml['max_listing_days'] = cli.max_listing_days
return yaml
@@ -290,5 +299,7 @@ def parse_config():
# check if proxy has not been set yet (optional)
if 'proxy' not in config:
config['proxy'] = None
+ if 'max_listing_days' not in config:
+ config['max_listing_days'] = None
return config
diff --git a/jobfunnel/config/valid_options.py b/jobfunnel/config/valid_options.py
index 2914f2d4..6697ba4c 100644
--- a/jobfunnel/config/valid_options.py
+++ b/jobfunnel/config/valid_options.py
@@ -25,13 +25,14 @@
'converge': [bool]
},
'proxy': [
- None,
+ None,
{
'protocol': str,
'ip_address': str,
'port': str
}
- ]
+ ],
+    'max_listing_days': [int]
}
PROVIDERS = ['glassdoor', 'indeed', 'monster']
diff --git a/jobfunnel/config/validate.py b/jobfunnel/config/validate.py
index 28d72836..5ded6007 100644
--- a/jobfunnel/config/validate.py
+++ b/jobfunnel/config/validate.py
@@ -68,3 +68,7 @@ def validate_config(config):
# check validity of delay settings
validate_delay(config['delay_config'])
+
+    # check the validity of the max_listing_days setting
+    if config['max_listing_days'] is not None and config['max_listing_days'] < 0:
+        raise ConfigError('max_listing_days')
diff --git a/jobfunnel/glassdoor.py b/jobfunnel/glassdoor.py
index 1803bd8d..99e278b5 100644
--- a/jobfunnel/glassdoor.py
+++ b/jobfunnel/glassdoor.py
@@ -282,14 +282,15 @@ def scrape(self):
# key by id
self.scrape_data[str(job['id'])] = job
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
+        # NOTE: the next three statements must stay in this order so that job dates are standardized before date_filter runs in pre_filter
+
# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]
-
# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
+ # apply job pre-filter before scraping blurbs
+ super().pre_filter(self.scrape_data, self.provider)
# checks if delay is set or not, then extracts blurbs from job links
if self.delay_config is not None:
diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py
index e2334b8d..d5f795fa 100755
--- a/jobfunnel/jobfunnel.py
+++ b/jobfunnel/jobfunnel.py
@@ -18,7 +18,7 @@
from requests import Session
from .tools.delay import delay_alg
-from .tools.filters import tfidf_filter, id_filter
+from .tools.filters import tfidf_filter, id_filter, date_filter
from .tools.tools import proxy_dict_to_url
# setting job status to these words removes them from masterlist + adds to
@@ -39,6 +39,8 @@ class JobFunnel(object):
filters """
def __init__(self, args):
+        # the maximum number of days old a job can be
+ self.max_listing_days = args['max_listing_days']
# paths
self.master_list_path = args['master_list_path']
self.filterlist_path = args['filter_list_path']
@@ -230,6 +232,9 @@ def update_filterjson(self):
def pre_filter(self, data: Dict[str, dict], provider):
"""function called by child classes that applies multiple filters
before getting job blurbs"""
+        # call date_filter if max_listing_days is set
+ if self.max_listing_days is not None:
+ date_filter(data, self.max_listing_days)
# call id_filter for master and duplicate lists, if they exist
if os.path.isfile(self.master_list_path):
id_filter(data, self.read_csv(self.master_list_path),
diff --git a/jobfunnel/monster.py b/jobfunnel/monster.py
index 17233f1e..603c8635 100644
--- a/jobfunnel/monster.py
+++ b/jobfunnel/monster.py
@@ -211,14 +211,14 @@ def scrape(self):
# key by id
self.scrape_data[str(job['id'])] = job
- # apply job pre-filter before scraping blurbs
- super().pre_filter(self.scrape_data, self.provider)
-
+        # NOTE: the next three statements must stay in this order so that job dates are standardized before date_filter runs in pre_filter
+
# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]
-
# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
+ # apply job pre-filter before scraping blurbs
+ super().pre_filter(self.scrape_data, self.provider)
threads = ThreadPoolExecutor(max_workers=8)
# checks if delay is set or not, then extracts blurbs from job links
diff --git a/jobfunnel/tools/filters.py b/jobfunnel/tools/filters.py
index 77f5a3a9..8833c87f 100644
--- a/jobfunnel/tools/filters.py
+++ b/jobfunnel/tools/filters.py
@@ -1,12 +1,35 @@
import nltk
import logging
-
+from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Optional
from numpy import delete as np_delete, max as np_max, fill_diagonal
+
+
+def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
+    """Filter out jobs that are older than number_of_days.
+    The assumed date format is yyyy-mm-dd.
+    Args:
+        cur_dict: today's job scrape dict
+        number_of_days: how many days old a job can be
+    """
+    if number_of_days < 0 or cur_dict is None:
+        return
+ cur_job_ids = [job['id'] for job in cur_dict.values()]
+    # calculate the oldest date a job can be
+ threshold_date = datetime.now() - timedelta(days=number_of_days)
+ for job_id in cur_job_ids:
+        # get the date from the job with job_id
+ job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
+        # if this job is older than threshold_date, delete it from the current scrape
+        if job_date < threshold_date:
+            del cur_dict[job_id]
diff --git a/readme.md b/readme.md
--- a/readme.md
+++ b/readme.md
@@ ... @@
 * **Blacklisting Companies**
   Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).
+
+* **Filtering Old Jobs**
+ Filter jobs that you think are too old:
+ `funnel -s JobFunnel/demo/settings.yaml --max_listing_days 30` will filter out job listings that are older than 30 days.
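+  The same limit can also be set in your `yaml` settings instead of on the command line; a minimal sketch, assuming a settings file laid out like `demo/settings.yaml`:
+
+  ```yaml
+  # keep only listings posted within the last 30 days
+  max_listing_days: 30
+  ```
+
+  Either way the cutoff is applied during pre-filtering, before blurbs are scraped, so old listings also cost no scraping time.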
+
* **Automating Searches**
JobFunnel can be easily automated to run nightly with [crontab][cron]