Skip to content

Commit

Permalink
Add seeding and clustering tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
egemenzeytinci committed Mar 18, 2021
1 parent e26a8a3 commit 5b9fb49
Show file tree
Hide file tree
Showing 25 changed files with 738 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
static/* linguist-vendored
template/* linguist-vendored
notebook/* linguist-vendored
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# PyCharm
*.iml
.idea/

# VSCode
.vscode/

# Python
*.pyc
__pycache__

# MacOS
.DS_Store

# Configuration directory
/config
39 changes: 39 additions & 0 deletions db/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from util.config import config


def get_engine():
    """
    Build a SQLAlchemy engine for PostgreSQL from the ``config.db`` settings

    The user name and password are percent-encoded before they are placed in
    the connection URL, so credentials containing reserved characters such as
    ``@``, ``:``, ``/`` or ``%`` no longer corrupt the URL.

    NOTE(review): this assumes the configured credentials are stored raw
    (not already percent-encoded) -- confirm against the config files.

    :return: SQLAlchemy engine
    :rtype: sqlalchemy.engine.Engine
    """
    # function-scope import keeps the fix local to this block
    from urllib.parse import quote

    host = config.db.host
    port = config.db.port
    db = config.db.db

    # safe='' makes quote() escape "/" as well; str() mirrors the implicit
    # conversion the f-string performed on the raw values before
    user = quote(str(config.db.user), safe='')
    password = quote(str(config.db.password), safe='')

    u = f'postgresql://{user}:{password}@{host}:{port}/{db}'

    return create_engine(u)


def get_session():
    """
    Create a new SQLAlchemy session from the module-level session factory

    :return: session
    :rtype: sqlalchemy.orm.session.Session
    """
    session = SessionItem()
    return session


# declarative base class for the ORM models
Base = declarative_base()

# default engine, built from the db configuration
engine = get_engine()

# session factory bound to the default engine
SessionItem = sessionmaker(autocommit=False, bind=engine)
9 changes: 9 additions & 0 deletions db/factory/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .basic import BasicFactory
from .rating import RatingFactory
from .score import ScoreFactory

# names exported by the db.factory package
__all__ = [
'BasicFactory',
'RatingFactory',
'ScoreFactory',
]
34 changes: 34 additions & 0 deletions db/factory/basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from db import get_session
from db.model import Basic


class BasicFactory:
    """Query and persistence helpers for ``Basic`` rows."""

    def get_by_ids(self, ids):
        """
        Get basic objects by their title ids

        :param list ids: title ids; ``None`` or an empty iterable yields ``[]``
        :return: basic objects
        :rtype: list[Basic]
        """
        # materialise once so any iterable can be checked for emptiness
        ids = [] if ids is None else list(ids)

        # nothing to look up: skip the session and the empty IN () query
        if not ids:
            return []

        session = get_session()

        try:
            return session.query(Basic).filter(Basic.title_id.in_(ids)).all()
        finally:
            session.close()

    def save_all(self, basics):
        """
        Save all objects

        Every object is merged into a single session and the whole batch is
        committed at once.

        :param list[Basic] basics: basic objects; ``None`` or an empty
            iterable is a no-op
        """
        basics = [] if basics is None else list(basics)

        # nothing to persist: do not open a session just to commit nothing
        if not basics:
            return

        session = get_session()

        try:
            for basic in basics:
                session.merge(basic)
            session.commit()
        finally:
            session.close()
25 changes: 25 additions & 0 deletions db/factory/rating.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from db import get_session
from db.model import Rating


class RatingFactory:
    """Read access to ``Rating`` rows."""

    def get_by_offset(self, limit=500, offset=1):
        """
        Fetch one page of rating objects ordered by title id

        ``offset`` is multiplied by ``limit`` to obtain the number of rows to
        skip, so it acts as a page index rather than a raw row offset; with
        the defaults the first 500 rows are skipped.

        :param int limit: page size
        :param int offset: page index
        :return: rating objects
        :rtype: list[Rating]
        """
        session = get_session()

        try:
            skipped = offset * limit
            ordered = session.query(Rating).order_by(Rating.title_id)
            page = ordered.limit(limit).offset(skipped)
            return page.all()
        finally:
            session.close()
34 changes: 34 additions & 0 deletions db/factory/score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from db import get_session
from db.model import Basic, Rating


class ScoreFactory:
    """Read access to the features used for scoring."""

    def get_all(self):
        """
        Collect scoring features for every crawled title that has a runtime

        Each row holds ``title_id``, ``start_year``, ``title_type`` and
        ``runtime`` from ``Basic`` plus ``num_votes`` from the ``Rating``
        joined on the title id.

        :return: list of ratings and basics features
        :rtype: list
        """
        session = get_session()

        try:
            query = session.query(
                Basic.title_id,
                Basic.start_year,
                Basic.title_type,
                Basic.runtime,
                Rating.num_votes,
            )
            query = query.join(Rating, Rating.title_id == Basic.title_id)
            query = query.filter(
                Basic.is_crawled.is_(True),
                Basic.runtime.isnot(None),
            )
            return query.all()
        finally:
            session.close()
11 changes: 11 additions & 0 deletions db/model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .basic import Basic
from .basic import TitleType
from .episode import Episode
from .rating import Rating

# names exported by the db.model package
__all__ = [
'Basic',
'Episode',
'Rating',
'TitleType',
]
54 changes: 54 additions & 0 deletions db/model/basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from attrdict import AttrDict
from db import Base
from enum import Enum
from sqlalchemy import Boolean, Column, Integer, SmallInteger, String, Text
from sqlalchemy.types import ARRAY as Array
import re


class TitleType(Enum):
    """Numeric codes for the supported title types."""

    MOVIE = 1
    TV_SERIES = 2
    TV_MINI_SERIES = 3
    TV_MOVIE = 4
    SHORT = 5
    TV_SHORT = 6
    TV_SPECIAL = 7

    @staticmethod
    def get(tt):
        """
        Translate a camelCase title type name into its numeric code

        An underscore is inserted before every capital letter except a
        leading one and the result is upper-cased, e.g. ``tvMiniSeries``
        becomes ``TV_MINI_SERIES``; that key is then looked up by name.

        :param str tt: camelCase title type
        :return: value of the matching member
        :rtype: int
        :raises KeyError: if no member carries the derived name
        """
        snake = re.sub(r'(?<!^)(?=[A-Z])', '_', tt)
        member = TitleType[snake.upper()]
        return member.value


class Basic(Base):
    """ORM model for the ``basics`` table, keyed by ``title_id``."""

    __tablename__ = 'basics'

    # identity and type
    title_id = Column('title_id', String(20), primary_key=True)
    title_type = Column('title_type', SmallInteger, nullable=False, index=True)

    # descriptive fields
    primary_title = Column('primary_title', String(500))
    original_title = Column('original_title', String(500))
    is_adult = Column('is_adult', Boolean)
    start_year = Column('start_year', Integer)
    end_year = Column('end_year', Integer)
    runtime = Column('runtime', Integer)
    genres = Column('genres', Array(String))

    # columns that have no entry in mapping()
    description = Column(Text)
    image_url = Column(Text)
    cluster = Column('cluster', Integer)
    is_crawled = Column('is_crawled', Boolean, default=False, index=True)

    @staticmethod
    def mapping():
        """
        Map camelCase source field names to this model's column names

        :return: source field name -> column name
        :rtype: AttrDict
        """
        return AttrDict({
            'tconst': 'title_id',
            'titleType': 'title_type',
            'primaryTitle': 'primary_title',
            'originalTitle': 'original_title',
            'isAdult': 'is_adult',
            'startYear': 'start_year',
            'endYear': 'end_year',
            'runtimeMinutes': 'runtime',
            'genres': 'genres',
        })
24 changes: 24 additions & 0 deletions db/model/episode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from attrdict import AttrDict
from db import Base
from sqlalchemy import cast, Column, ForeignKey, Integer, String
from sqlalchemy.orm import column_property


class Episode(Base):
    """ORM model for the ``episodes`` table, linked to ``basics`` by parent id."""

    __tablename__ = 'episodes'

    title_id = Column('title_id', String(20), primary_key=True)
    parent_id = Column('parent_id', String(20), ForeignKey('basics.title_id'))
    season_number = Column('season_number', Integer)
    episode_number = Column('episode_number', Integer)

    # SQL expression concatenating 'S', the season number, 'E' and the episode number
    info = column_property('S' + cast(season_number, String) + 'E' + cast(episode_number, String))

    @staticmethod
    def mapping():
        """
        Map camelCase source field names to this model's column names

        :return: source field name -> column name
        :rtype: AttrDict
        """
        return AttrDict({
            'tconst': 'title_id',
            'parentTconst': 'parent_id',
            'seasonNumber': 'season_number',
            'episodeNumber': 'episode_number',
        })
20 changes: 20 additions & 0 deletions db/model/rating.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from attrdict import AttrDict
from db import Base
from sqlalchemy import Column, Float, Integer, String


class Rating(Base):
    """ORM model for the ``ratings`` table, keyed by ``title_id``."""

    __tablename__ = 'ratings'

    title_id = Column('title_id', String(20), primary_key=True)
    average_rating = Column('average_rating', Float)
    num_votes = Column('num_votes', Integer)

    @staticmethod
    def mapping():
        """
        Map camelCase source field names to this model's column names

        :return: source field name -> column name
        :rtype: AttrDict
        """
        return AttrDict({
            'tconst': 'title_id',
            'averageRating': 'average_rating',
            'numVotes': 'num_votes',
        })
5 changes: 5 additions & 0 deletions db/seed/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .seed import Seed

# names exported by the db.seed package
__all__ = [
'Seed',
]
Loading

0 comments on commit 5b9fb49

Please sign in to comment.