diff --git a/env.example b/env.example
index 81e5f97a..863188d2 100644
--- a/env.example
+++ b/env.example
@@ -8,6 +8,7 @@ export RIKOLTI_DATA=file:///usr/local/airflow/rikolti_data
 export NUXEO= # ask for a key - required to run the NuxeoFetcher
 export FLICKR_API_KEY= # ask for a key - required to run the FlickrFetcher
 export CALISPHERE_ETL_TOKEN= # ask for token - required to run Calisphere Solr Fetcher
+export YOUTUBE_API_KEY="" # ask for a key - required to run the YouTube Fetcher
 
 # metadata_mapper
 export SKIP_UNDEFINED_ENRICHMENTS=True
diff --git a/metadata_fetcher/fetchers/youtube_fetcher.py b/metadata_fetcher/fetchers/youtube_fetcher.py
new file mode 100644
index 00000000..ee7438c8
--- /dev/null
+++ b/metadata_fetcher/fetchers/youtube_fetcher.py
@@ -0,0 +1,95 @@
+import json
+import logging
+from urllib.parse import urlencode
+
+from .. import settings
+from .Fetcher import Fetcher
+
+
+class YoutubeFetcher(Fetcher):
+    """Fetch pages of snippets from the YouTube Data API."""
+
+    def __init__(self, params):
+        """
+        Read the API url and any saved page token from `params`.
+
+        `params["harvest_data"]["url"]` is the url to page through;
+        `params["next_page_token"]` carries the token written by `json`.
+        """
+        super().__init__(params)
+
+        self.harvest_data = params.get("harvest_data", {})
+        self.url = self.harvest_data.get("url")
+        self.next_page_token = params.get("next_page_token")
+
+    def build_fetch_request(self):
+        """
+        Build the request for the current page.
+
+        We expect to receive a url like these, with the playlistId or id
+        parameter supplied:
+        - https://www.googleapis.com/youtube/v3/playlistItems?playlistId={id}
+        - https://www.googleapis.com/youtube/v3/videos?id={id}
+
+        Returns a dict with a single "url" key.
+        """
+        query = {
+            "key": settings.YOUTUBE_API_KEY,
+            "part": "snippet",
+            "maxResults": 50,
+        }
+        if self.next_page_token:
+            query["pageToken"] = self.next_page_token
+
+        base_url = self.url.rstrip('/')
+        # The expected urls already carry a query string, so append with
+        # "&"; start a query string with "?" if the url has none.
+        separator = "&" if "?" in base_url else "?"
+
+        return {"url": f"{base_url}{separator}{urlencode(query)}"}
+
+    def check_page(self, http_resp) -> int:
+        """
+        Return the number of items in the fetched page.
+
+        A response without an "items" list counts as an empty page
+        instead of raising a TypeError.
+        """
+        data = json.loads(http_resp.content)
+        items = data.get("items") or []
+
+        if items:
+            logging.debug(
+                f"{self.collection_id}, fetched page {self.write_page} - "
+                f"{len(items)} hits,-,-,-,-,-"
+            )
+
+        return len(items)
+
+    def increment(self, http_resp):
+        """
+        Advance the page counter and store the response's nextPageToken.
+
+        The token is None when the response has no "nextPageToken" key.
+        """
+        self.write_page = self.write_page + 1
+
+        data = http_resp.json()
+        self.next_page_token = data.get("nextPageToken")
+
+    def json(self) -> str:
+        """
+        Serialize the state needed to fetch the next page.
+
+        Returns '{"finished": true}' when there is no next page token.
+        """
+        if not self.next_page_token:
+            return json.dumps({"finished": True})
+
+        return json.dumps({
+            "harvest_type": self.harvest_type,
+            "collection_id": self.collection_id,
+            "harvest_data": self.harvest_data,
+            "write_page": self.write_page,
+            "next_page_token": self.next_page_token
+        })
diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py
index 74fca6d7..76dbd28a 100644
--- a/metadata_fetcher/settings.py
+++ b/metadata_fetcher/settings.py
@@ -10,6 +10,7 @@ NUXEO_TOKEN = os.environ.get('NUXEO')
 
 FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY')
 CALISPHERE_ETL_TOKEN = os.environ.get('CALISPHERE_ETL_TOKEN')
+YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')
 
 for key, value in os.environ.items():
     logger.debug(f"{key}={value}")
diff --git a/metadata_mapper/mappers/youtube/youtube_mapper.py b/metadata_mapper/mappers/youtube/youtube_mapper.py
new file mode 100644
index 00000000..bbc16c1a
--- /dev/null
+++ b/metadata_mapper/mappers/youtube/youtube_mapper.py
@@ -0,0 +1,80 @@
+import json
+
+from ..mapper import Record, Vernacular
+
+
+class YoutubeRecord(Record):
+    """Map one item from a YouTube Data API response to UCLDC fields."""
+
+    def UCLDC_map(self):
+        """Return the dict of UCLDC field names to values or mapper methods."""
+        return {
+            "calisphere-id": self.legacy_couch_db_id.split('--')[1],
+            "isShownAt": self.map_is_shown_at,
+            "isShownBy": self.map_is_shown_by,
+            "description": self.map_description,
+            "subject": self.map_subject,
+            "title": self.map_title
+        }
+
+    def map_is_shown_at(self):
+        """
+        Return the watch url for this video, or None without a video id.
+
+        playlistItems resources carry the video id in
+        snippet.resourceId.videoId (their top-level id identifies the
+        playlist entry); videos resources carry it in the top-level id.
+        """
+        snippet = self.source_metadata.get("snippet", {})
+        video_id = (
+            snippet.get("resourceId", {}).get("videoId")
+            or self.source_metadata.get("id")
+        )
+        if video_id:
+            return f"https://www.youtube.com/watch?v={video_id}"
+
+    def map_is_shown_by(self):
+        """
+        Return a thumbnail url, preferring the "standard" size.
+
+        Falls back to the smaller sizes because the API does not provide
+        a "standard" thumbnail for every video.
+        """
+        snippet = self.source_metadata.get("snippet", {})
+        thumbnails = snippet.get("thumbnails", {})
+        for size in ("standard", "high", "medium", "default"):
+            url = thumbnails.get(size, {}).get("url")
+            if url:
+                return url
+
+    def map_description(self):
+        """Return the snippet description, or None if absent."""
+        return self.source_metadata.get("snippet", {}).get("description")
+
+    def map_subject(self):
+        """Return one subject per snippet tag; empty without tags."""
+        tags = self.source_metadata.get("snippet", {}).get("tags") or []
+
+        return [{"name": tag} for tag in tags]
+
+    def map_title(self):
+        """Return the snippet title, or None if absent."""
+        return self.source_metadata.get("snippet", {}).get("title")
+
+
+class YoutubeVernacular(Vernacular):
+    """Vernacular for YouTube Data API pages; records use YoutubeRecord."""
+    record_cls = YoutubeRecord
+
+    def parse(self, api_response):
+        """
+        Parse one page of API results and pass its items to get_records.
+
+        Each item gets a "calisphere-id" of "<collection_id>--<item id>".
+        A response without an "items" list yields no items.
+        """
+        items = json.loads(api_response).get("items") or []
+        for item in items:
+            item["calisphere-id"] = f"{self.collection_id}--{item.get('id')}"
+
+        return self.get_records(items)