Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export RIKOLTI_DATA=file:///usr/local/airflow/rikolti_data
export NUXEO= # ask for a key - required to run the NuxeoFetcher
export FLICKR_API_KEY= # ask for a key - required to run the FlickrFetcher
export CALISPHERE_ETL_TOKEN= # ask for token - required to run Calisphere Solr Fetcher
export YOUTUBE_API_KEY="" # ask for a key - required to run the YoutubeFetcher

# metadata_mapper
export SKIP_UNDEFINED_ENRICHMENTS=True
Expand Down
63 changes: 63 additions & 0 deletions metadata_fetcher/fetchers/youtube_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import json
import logging

from .. import settings
from .Fetcher import Fetcher

class YoutubeFetcher(Fetcher):
    """Fetch pages of item metadata from a YouTube Data API v3 endpoint.

    The endpoint comes from ``harvest_data["url"]``. Paging state is
    carried between invocations through ``next_page_token``.
    """

    def __init__(self, params):
        """Initialise the fetcher from a harvest parameter dict.

        Args:
            params: dict that may contain ``harvest_data`` (a dict with a
                ``url`` key) and ``next_page_token``; it is also passed to
                the base ``Fetcher`` constructor.
        """
        super().__init__(params)

        self.harvest_data = params.get("harvest_data", {})
        self.url = self.harvest_data.get("url")
        self.next_page_token = params.get("next_page_token")

    def build_fetch_request(self):
        """Build the request for the current page.

        We expect to receive a url like these, with the playlistId or id
        parameter supplied:
        - https://www.googleapis.com/youtube/v3/playlistItems?playlistId={id}
        - https://www.googleapis.com/youtube/v3/videos?id={id}

        Returns:
            dict with a single ``url`` key holding the full request URL.
        """
        base_url = self.url.rstrip('/')
        # The configured URL normally already carries a query string; only
        # start one ourselves when it does not, so the result stays valid.
        separator = "&" if "?" in base_url else "?"

        url = (
            f"{base_url}"
            f"{separator}key={settings.YOUTUBE_API_KEY}"
            f"&part=snippet"
            f"&maxResults=50"
        )

        if self.next_page_token:
            url += f"&pageToken={self.next_page_token}"

        return {"url": url}

    def check_page(self, http_resp) -> int:
        """Return the number of items in the fetched page.

        Args:
            http_resp: response object whose ``content`` is the JSON body.

        Returns:
            The length of the ``items`` array, or 0 when the response has
            no ``items`` (missing key, null, or empty list).
        """
        data = json.loads(http_resp.content)
        # A response without "items" would otherwise make len() raise.
        items = data.get("items") or []

        if items:
            logging.debug(
                f"{self.collection_id}, fetched page {self.write_page} - "
                f"{len(items)} hits,-,-,-,-,-"
            )

        return len(items)

    def increment(self, http_resp):
        """Advance to the next page.

        Bumps ``write_page`` and stores the ``nextPageToken`` from the
        response (``None`` when the response does not include one).
        """
        self.write_page = self.write_page + 1

        data = http_resp.json()
        self.next_page_token = data.get("nextPageToken")

    def json(self) -> str:
        """Serialise the fetcher state as a JSON string.

        Returns:
            ``{"finished": true}`` when there is no next page token,
            otherwise the parameters needed to fetch the next page.
        """
        if not self.next_page_token:
            return json.dumps({"finished": True})

        return json.dumps({
            "harvest_type": self.harvest_type,
            "collection_id": self.collection_id,
            "harvest_data": self.harvest_data,
            "write_page": self.write_page,
            "next_page_token": self.next_page_token
        })
1 change: 1 addition & 0 deletions metadata_fetcher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Credentials for the individual fetchers, read from the environment.
# Each one is None when its environment variable is not set.
NUXEO_TOKEN = os.environ.get('NUXEO')
FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY')
CALISPHERE_ETL_TOKEN = os.environ.get('CALISPHERE_ETL_TOKEN')
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')

# NOTE(review): this emits every environment variable at debug level,
# which includes the keys and tokens read above - confirm that debug logs
# are never shipped anywhere those secrets should not appear.
for key, value in os.environ.items():
logger.debug(f"{key}={value}")
47 changes: 47 additions & 0 deletions metadata_mapper/mappers/youtube/youtube_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import json

from ..mapper import Record, Vernacular

class YoutubeRecord(Record):
    """Map one YouTube Data API item onto UCLDC fields.

    Every ``map_*`` method reads from ``self.source_metadata`` and
    tolerates a missing ``snippet`` section.
    """

    def UCLDC_map(self):
        """Return the field mapping for this record.

        Most values are the bound ``map_*`` methods themselves rather than
        their results; presumably the base ``Record`` calls them later.
        """
        return {
            "calisphere-id": self.legacy_couch_db_id.split('--')[1],
            "isShownAt": self.map_is_shown_at,
            "isShownBy": self.map_is_shown_by,
            "description": self.map_description,
            "subject": self.map_subject,
            "title": self.map_title
        }

    def map_is_shown_at(self):
        """Return the watch URL built from the item ``id``, or None."""
        # NOTE(review): for playlistItems responses the YouTube API puts
        # the video ID under snippet.resourceId.videoId, and ``id`` is the
        # playlist item ID - confirm ``id`` is the right field there.
        if self.source_metadata.get('id'):
            return f"https://www.youtube.com/watch?v={self.source_metadata.get('id')}"
        return None

    def map_is_shown_by(self):
        """Return the URL of the ``standard`` thumbnail, or None."""
        thumbnails = self.source_metadata.get("snippet", {}).get("thumbnails", {})
        return thumbnails.get("standard", {}).get("url")

    def map_description(self):
        """Return the snippet description, or None when absent."""
        return self.source_metadata.get("snippet", {}).get("description")

    def map_subject(self):
        """Return the snippet tags as a list of ``{"name": tag}`` dicts.

        Returns an empty list when the item has no tags, instead of
        failing while iterating over ``None``.
        """
        tags = self.source_metadata.get("snippet", {}).get("tags") or []

        return [{"name": tag} for tag in tags]

    def map_title(self):
        """Return the snippet title, or None when absent."""
        return self.source_metadata.get("snippet", {}).get("title")


class YoutubeVernacular(Vernacular):
    """Parse a fetched YouTube API page into ``YoutubeRecord`` objects."""

    record_cls = YoutubeRecord

    def parse(self, api_response):
        """Turn one API response page into records.

        Args:
            api_response: JSON text of a YouTube Data API response.

        Returns:
            Whatever ``self.get_records`` returns for the response items,
            each tagged with a ``calisphere-id`` of the form
            ``{collection_id}--{item id}``. A response without ``items``
            yields an empty record list.
        """
        def modify_record(record):
            record.update({"calisphere-id": f"{self.collection_id}--"
                                            f"{record.get('id')}"})
            return record

        # Decode the payload once; a missing or null "items" is treated
        # as an empty page rather than raising while iterating None.
        items = json.loads(api_response).get("items") or []
        records = [modify_record(record) for record in items]
        return self.get_records(records)