From 5958e428a985f0c327c4c05233ede18130697553 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Thu, 6 Jul 2023 10:59:16 -0700 Subject: [PATCH 1/4] Implement Youtube fetcher --- metadata_fetcher/fetchers/youtube_fetcher.py | 197 +++++++++++++++++++ metadata_fetcher/settings.py | 1 + 2 files changed, 198 insertions(+) create mode 100644 metadata_fetcher/fetchers/youtube_fetcher.py diff --git a/metadata_fetcher/fetchers/youtube_fetcher.py b/metadata_fetcher/fetchers/youtube_fetcher.py new file mode 100644 index 000000000..00bb9309f --- /dev/null +++ b/metadata_fetcher/fetchers/youtube_fetcher.py @@ -0,0 +1,197 @@ +import json +from .Fetcher import Fetcher +import requests +from requests.adapters import HTTPAdapter +from requests.adapters import Retry +from urllib.parse import urlencode +import settings + + +class YoutubeFetcher(Fetcher): + BASE_URL = "https://www.googleapis.com/youtube/v3/" + + def __init__(self, params: dict[str]): + """ + Parameters: + params: dict[str] + """ + super(YoutubeFetcher, self).__init__(params) + + # If `next_url` is a param, we know that this is not the fetch of the + # first page, so skip setting those attributes + if "next_url" in params: + for key in params: + setattr(self, key, params[key]) + return + + self.collection_id = params.get("collection_id") + self.external_id = params.get("harvest_data").get("harvest_extra_data") + # 50 is the max per_page value + self.per_page = 1 + self.page_token = None + self.next_page_token = None + self.next_url = self.get_current_url() + + def get_current_url(self) -> str: + """ + If it's a single video, fetch it, otherwise fetch a page of results from a + playlist or user upload. 
+ + Returns: str + """ + if self.is_single(): + return self.get_videos_by_id_request([self.external_id]) + + params = { + "maxResults": self.per_page, + "part": "contentDetails", + "playlistId": self.external_id, + "pageToken": self.page_token or "" + } + endpoint = "playlistItems" + + return self.build_request_url(endpoint, params) + + def is_single(self) -> bool: + """ + Based on the external_id, determines if it's a single video in the + simplest way possible. + + Returns: bool + """ + return len(self.external_id) == 11 + + def get_videos_by_id_request(self, external_ids: list) -> str: + """ + Parameters: + external_ids: list + + Returns: str + """ + params = { + "part": "snippet", + "id": ",".join(external_ids) + } + endpoint = "videos" + + return self.build_request_url(endpoint, params) + + def build_request_url(self, endpoint: str, params: dict[str]) -> str: + """ + Creates a YouTube API request URL from dictionary of request parameters. + + Parameters: + endpoint: str + params: dict[str] + + Returns: str + """ + params["key"] = settings.YOUTUBE_API_KEY + + return self.BASE_URL + endpoint + "?" + urlencode(params) + + def build_fetch_request(self) -> dict[str]: + """ + Generates arguments for `requests.get()`. + + Returns: dict[str] + """ + request = {"url": self.get_current_url()} + + print( + f"[{self.collection_id}]: Fetching page {self.write_page} " + f"at {request.get('url')}") + + return request + + def aggregate_vernacular_content(self, response: requests.Response) -> str: + """ + If it's a single video, this response is our page content. Otherwise, we + need to iterate the returned records and fetch them by id. 
+ + Parameters: + response: requests.Response + + Returns: str + """ + content = response.text + if self.is_single(): + return content + + items = json.loads(content) + + external_ids = [item.get("contentDetails").get("videoId") + for item in items.get("items")] + + return self.get_video_metadata(external_ids).text + + def get_video_metadata(self, external_ids: list) -> requests.Response: + """ + Performs a request for video info and returns the response. Attempts + retries. + + Parameters: + external_ids: list + + Returns: requests.Response + """ + print( + f"[{self.collection_id}]: Fetching videos {external_ids}" + ) + + session = requests.Session() + retries = Retry(total=3, backoff_factor=2) + session.mount("https://", HTTPAdapter(max_retries=retries)) + return session.get(url=self.get_videos_by_id_request(external_ids)) + + def check_page(self, http_resp: requests.Response) -> int: + """ + Parameters: + http_resp: requests.Response + + Returns: int + """ + data = json.loads(http_resp.content) + count = len(data.get("items")) + + print( + f"[{self.collection_id}]: Fetched ids for page {self.write_page} " + f"at {http_resp.url} with {count} hits" + ) + + return count + + def increment(self, http_resp: requests.Response): + """ + Sets the `next_url` to fetch and increments the page number. + + Parameters: + http_resp: requests.Response + """ + super(YoutubeFetcher, self).increment(http_resp) + + data = json.loads(http_resp.content) + self.next_page_token = data.get("nextPageToken", None) + + self.next_url = self.get_current_url() if self.next_page_token else None + + def json(self) -> str: + """ + Generates JSON for the next page of results. 
+ + Returns: str + """ + current_state = { + "harvest_type": self.harvest_type, + "external_id": self.external_id, + "collection_id": self.collection_id, + "next_url": self.next_url, + "page_token": self.next_page_token, + "write_page": self.write_page, + "per_page": self.per_page + } + + if not self.next_url: + current_state.update({"finished": True}) + + return json.dumps(current_state) diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py index 7c6c2f729..23e6d93aa 100644 --- a/metadata_fetcher/settings.py +++ b/metadata_fetcher/settings.py @@ -11,6 +11,7 @@ NUXEO_TOKEN = os.environ.get('NUXEO') FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') +YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY') DATA_DEST_URL = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") DATA_DEST = { From b23ab711efe1ee5ae1928c1b5c393a1ddf66d84a Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Mon, 17 Jul 2023 12:58:24 -0700 Subject: [PATCH 2/4] Implement Youtube mapper --- .../mappers/youtube/youtube_mapper.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 metadata_mapper/mappers/youtube/youtube_mapper.py diff --git a/metadata_mapper/mappers/youtube/youtube_mapper.py b/metadata_mapper/mappers/youtube/youtube_mapper.py new file mode 100644 index 000000000..781966ef4 --- /dev/null +++ b/metadata_mapper/mappers/youtube/youtube_mapper.py @@ -0,0 +1,56 @@ +import json + +from ..mapper import Vernacular, Record + + +class YoutubeRecord(Record): + def UCLDC_map(self): + return { + "calisphere-id": self.legacy_couch_db_id.split('--')[1], + "isShownAt": self.map_is_shown_at, + "isShownBy": self.map_is_shown_by, + "description": self.map_description, + "subject": self.map_subject, + "title": self.map_title + } + + def map_is_shown_at(self): + video_id = self.source_metadata.get("id") + return f"https://www.youtube.com/watch?v={video_id}" + + def map_is_shown_by(self): + thumbnails = self.source_metadata.get("snippet", {}).get("thumbnails") + 
for thumb_label in ["standard", "high", "medium", "default"]: + thumb_url = thumbnails.get(thumb_label, {}).get("url") + if thumb_url is not None: + break + else: + thumb_url = None + + return thumb_url + + def map_description(self): + return self.source_metadata.get("snippet", {}).get("description") + + def map_subject(self): + return self.source_metadata.get("snippet", {}).get("tags") + + def map_title(self): + return self.source_metadata.get("snippet", {}).get("title") + + +class YoutubeVernacular(Vernacular): + record_cls = YoutubeRecord + + def parse(self, api_response): + def modify_record(record): + record.update({"calisphere-id": f"{self.collection_id}--" + f"{record.get('id')}"}) + return record + + records = [modify_record(record) for record + in json.loads(api_response).get("items")] + + records = self.get_records(records) + return records + From 51760e061c6146f45828606e24f608c99cd78bd2 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Tue, 24 Oct 2023 15:43:44 -0700 Subject: [PATCH 3/4] Update YouTube fetcher and mapper * Add type hints * Fix title, description and subject types * Make update based on underlying code changes --- metadata_fetcher/fetchers/youtube_fetcher.py | 7 +++--- .../mappers/youtube/youtube_mapper.py | 23 ++++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/metadata_fetcher/fetchers/youtube_fetcher.py b/metadata_fetcher/fetchers/youtube_fetcher.py index 00bb9309f..679769493 100644 --- a/metadata_fetcher/fetchers/youtube_fetcher.py +++ b/metadata_fetcher/fetchers/youtube_fetcher.py @@ -4,7 +4,7 @@ from requests.adapters import HTTPAdapter from requests.adapters import Retry from urllib.parse import urlencode -import settings +from .. 
import settings class YoutubeFetcher(Fetcher): @@ -104,17 +104,16 @@ def build_fetch_request(self) -> dict[str]: return request - def aggregate_vernacular_content(self, response: requests.Response) -> str: + def aggregate_vernacular_content(self, content: str) -> str: """ If it's a single video, this response is our page content. Otherwise, we need to iterate the returned records and fetch them by id. Parameters: - response: requests.Response + content: str Returns: str """ - content = response.text if self.is_single(): return content diff --git a/metadata_mapper/mappers/youtube/youtube_mapper.py b/metadata_mapper/mappers/youtube/youtube_mapper.py index 781966ef4..f247bbb5f 100644 --- a/metadata_mapper/mappers/youtube/youtube_mapper.py +++ b/metadata_mapper/mappers/youtube/youtube_mapper.py @@ -1,5 +1,5 @@ import json - +from typing import Optional from ..mapper import Vernacular, Record @@ -14,11 +14,11 @@ def UCLDC_map(self): "title": self.map_title } - def map_is_shown_at(self): + def map_is_shown_at(self) -> str: video_id = self.source_metadata.get("id") return f"https://www.youtube.com/watch?v={video_id}" - def map_is_shown_by(self): + def map_is_shown_by(self) -> Optional[str]: thumbnails = self.source_metadata.get("snippet", {}).get("thumbnails") for thumb_label in ["standard", "high", "medium", "default"]: thumb_url = thumbnails.get(thumb_label, {}).get("url") @@ -29,21 +29,22 @@ def map_is_shown_by(self): return thumb_url - def map_description(self): - return self.source_metadata.get("snippet", {}).get("description") + def map_description(self) -> list: + return [self.source_metadata.get("snippet", {}).get("description")] - def map_subject(self): - return self.source_metadata.get("snippet", {}).get("tags") + def map_subject(self) -> list: + return [{"name": v} for v in + self.source_metadata.get("snippet", {}).get("tags", [])] - def map_title(self): - return self.source_metadata.get("snippet", {}).get("title") + def map_title(self) -> list: + return 
[self.source_metadata.get("snippet", {}).get("title")] class YoutubeVernacular(Vernacular): record_cls = YoutubeRecord - def parse(self, api_response): - def modify_record(record): + def parse(self, api_response: str) -> list: + def modify_record(record: dict) -> dict: record.update({"calisphere-id": f"{self.collection_id}--" f"{record.get('id')}"}) return record From b3f15144cd5158eb5d8b32f37ff3924ee4a73d69 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Tue, 24 Oct 2023 15:49:52 -0700 Subject: [PATCH 4/4] Reconcile move_date_values with dict-wrapped subjects youtube The subject list must be filled with dicts in the format: {"name": "this is the subject"} to be transformed into a list of strings in the solr doc. The move_date_values enrichment wasn't processing dicts, so the two were out of step with each other. --- metadata_mapper/mappers/mapper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 89a10cb23..202848394 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -134,7 +134,6 @@ def to_UCLDC(self) -> dict[str, Any]: # Mapped value may be a function or lambda self.mapped_data = {k: v() if isinstance(v, Callable) else v for (k, v) in mapped_data.items()} - return self.mapped_data def UCLDC_map(self) -> dict: @@ -494,10 +493,15 @@ def move_date_values(self, prop, dest="temporal"): remove = [] for value in src_values: - if not isinstance(value, str): + if isinstance(value, dict): + if "name" not in value: + continue + value = value.get("name") + elif not isinstance(value, str): continue cleaned_value = re.sub(r"[\(\)\.\?]", "", value) cleaned_value = cleaned_value.strip() + for pattern in constants.move_date_value_reg_search: matches = re.compile(pattern, re.I).findall(cleaned_value) if (len(matches) == 1 and