Skip to content

Commit 95a5450

Browse files
modified code base
1 parent 7cbd318 commit 95a5450

File tree

8 files changed

+285
-53
lines changed

8 files changed

+285
-53
lines changed

.pylintrc

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ confidence=
5959
#
6060
# Kubeflow disables string-interpolation because we are starting to use f
6161
# style strings
62-
disable=bad-indentation,unspecified-encoding,missing-class-docstring,missing-module-docstring,no-name-in-module,dangerous-default-value,broad-except,import-outside-toplevel,bare-except
62+
disable=bad-indentation,unspecified-encoding,missing-class-docstring,missing-module-docstring,no-name-in-module,dangerous-default-value,broad-except,import-outside-toplevel,bare-except,invalid-name
6363

6464

6565
[REPORTS]

pyproject.toml

+3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ openpyxl = "^3.1.2"
1616
pypika = "^0.48.9"
1717

1818

19+
[tool.poetry.group.dev.dependencies]
20+
ipykernel = "^6.23.1"
21+
1922
[build-system]
2023
requires = ["poetry-core"]
2124
build-backend = "poetry.core.masonry.api"

src/app/logger.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
def setup_logging(log_dir: str):
77
"""Load logging configuration"""
88

9-
log_file_name = log_dir + '/' + 'minimal-app-' + datetime.now().strftime("%Y-%m-%d") + '.log'
9+
log_file_name = log_dir + '/' + 'minimal-app-' + \
10+
datetime.now().strftime("%Y-%m-%d") + '.log'
1011

1112
loging_config = {
1213
'version': 1,
1314
'disable_existing_loggers': False,
1415
'loggers': {
1516
'root': {
16-
'level': 'INFO',
17+
'level': 'DEBUG',
1718
'handlers': ['debug_console_handler', 'info_rotating_file_handler'],
1819
},
1920
'src': {
+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .dialect import RedshiftDialect, RedshiftQuery
1+
from .dialect import RedshiftDialect
22
from .query_builder import QueryBuilder
+243-43
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,284 @@
1+
import functools
12
import logging
2-
from typing import Any, Dict, List
3+
import textwrap
4+
from collections import defaultdict
5+
from typing import (Any, Callable, Dict, Iterable, List, Mapping, NamedTuple,
6+
Optional, Sequence, Tuple, TypeVar, Union)
37

48
from pydantic.dataclasses import dataclass
5-
from pypika import Query, Schema, Table
6-
from pypika.enums import Dialects, JoinType
7-
from pypika.queries import QueryBuilder
8-
from pypika.utils import builder
99

10-
from app.services.query_builder.join_queries import JoinQuery
10+
from app.etl_exceptions import AutoETLException
1111

1212
logger = logging.getLogger(__name__)
1313

1414

15-
class RedshiftQuery(Query):
16-
"""
17-
Query class for AWS Redshift
18-
"""
15+
_T = TypeVar('_T', bound='QueryValue')
16+
17+
18+
_Q = TypeVar('_Q', bound='BaseQuery')
19+
_QArg = Union[str, Tuple[str, ...]]
20+
21+
22+
class QueryValue(NamedTuple):
23+
value: str
24+
alias: str = ''
25+
on_condn: str = ''
26+
keyword: str = ''
27+
is_subquery: bool = False
1928

2029
@classmethod
21-
def _builder(cls, **kwargs: Any) -> "RedshiftQueryBuilder":
22-
return RedshiftQueryBuilder(**kwargs)
30+
def from_arg(cls, arg: _QArg, **kwargs: Any) -> 'QueryValue':
31+
"""method to set the parameter for the QueryValue
32+
33+
Args:
34+
arg (_QArg): either a bare value string, or a tuple of (alias, value) or (alias, value, on_condition)
35+
36+
Raises:
37+
ValueError
38+
39+
Returns:
40+
class object 'QueryValue'
41+
"""
42+
if isinstance(arg, str):
43+
alias, value, on_condn = '', arg, ''
44+
elif len(arg) == 3 and 'JOIN' in kwargs['keyword']:
45+
alias, value, on_condn = arg
46+
elif len(arg) == 2:
47+
alias, value = arg
48+
on_condn = ''
49+
else: # pragma: no cover
50+
raise ValueError(f"invalid arg: {arg!r}")
51+
return cls(_clean_up(value), _clean_up(alias), _clean_up(on_condn), **kwargs)
52+
53+
54+
class _FlagList(List[_T]):
55+
flag: str = ''
56+
57+
58+
def _clean_up(thing: str) -> str:
59+
return textwrap.dedent(thing.rstrip()).strip()
60+
61+
62+
class BaseQuery:
63+
64+
keywords = [
65+
'WITH',
66+
'SELECT',
67+
'FROM',
68+
'JOIN',
69+
'WHERE',
70+
'GROUP BY',
71+
'HAVING',
72+
'ORDER BY',
73+
'LIMIT',
74+
]
75+
76+
separators: Mapping[str, str] = dict(WHERE='AND', HAVING='AND')
77+
default_separator = ','
78+
79+
formats: Tuple[Mapping[str, str], ...] = (
80+
defaultdict(lambda: '{value}'),
81+
defaultdict(lambda: '{value} AS {alias}', WITH='{alias} AS {value}'),
82+
)
83+
84+
subquery_keywords = {'WITH'}
85+
fake_keywords = dict(JOIN='FROM')
86+
flag_keywords = dict(SELECT={'DISTINCT', 'ALL'})
87+
88+
def __init__(
89+
self,
90+
data: Optional[Mapping[str, Iterable[_QArg]]] = None,
91+
separators: Optional[Mapping[str, str]] = None,
92+
) -> None:
93+
"""
94+
"""
95+
self.data: Mapping[str, _FlagList[QueryValue]] = {}
96+
if data is None:
97+
data = dict.fromkeys(self.keywords, ())
98+
for keyword, args in data.items():
99+
self.data[keyword] = _FlagList()
100+
self.add(keyword, *args)
101+
102+
if separators is not None:
103+
self.separators = separators
104+
105+
def add(self: _Q, keyword: str, *args: _QArg) -> _Q:
106+
"""method to add params to the query object
107+
108+
Args:
109+
self (_Q): current instance of BaseQuery
110+
keyword (str): keyword to be added to the query object
23111
112+
Raises:
113+
ValueError
24114
25-
class RedshiftQueryBuilder(QueryBuilder):
26-
QUERY_CLS = RedshiftQuery
115+
Returns:
116+
_Q: BaseQuery
117+
"""
118+
keyword, fake_keyword = self._resolve_fakes(keyword)
119+
keyword, flag = self._resolve_flags(keyword)
120+
target = self.data[keyword]
121+
122+
if flag:
123+
if target.flag: # pragma: no cover
124+
raise ValueError(f"{keyword} already has flag: {flag!r}")
125+
target.flag = flag
126+
127+
kwargs: Dict[str, Any] = {}
128+
if fake_keyword:
129+
kwargs.update(keyword=fake_keyword)
130+
if keyword in self.subquery_keywords:
131+
kwargs.update(is_subquery=True)
132+
133+
for arg in args:
134+
target.append(QueryValue.from_arg(arg, **kwargs))
135+
136+
return self
137+
138+
def _resolve_fakes(self, keyword: str) -> Tuple[str, str]:
139+
for part, real in self.fake_keywords.items():
140+
if part in keyword:
141+
return real, keyword
142+
return keyword, ''
143+
144+
def _resolve_flags(self, keyword: str) -> Tuple[str, str]:
145+
prefix, _, flag = keyword.partition(' ')
146+
if prefix in self.flag_keywords:
147+
if flag and flag not in self.flag_keywords[prefix]:
148+
raise ValueError(f"invalid flag for {prefix}: {flag!r}")
149+
return prefix, flag
150+
return keyword, ''
151+
152+
def __getattr__(self: _Q, name: str) -> Callable[..., _Q]:
153+
# conveniently, avoids shadowing dunder methods (e.g. __deepcopy__)
154+
if not name.isupper():
155+
return getattr(super(), name) # type: ignore
156+
return functools.partial(self.add, name.replace('_', ' '))
157+
158+
def __str__(self) -> str:
159+
return ''.join(self._lines())
160+
161+
def _lines(self) -> Iterable[str]:
162+
for keyword, things in self.data.items():
163+
if not things:
164+
continue
165+
166+
if things.flag:
167+
yield f'{keyword} {things.flag}\n'
168+
else:
169+
yield f'{keyword}\n'
27170

28-
def __init__(self, **kwargs: Any) -> None:
29-
super().__init__(dialect=Dialects.REDSHIFT, **kwargs)
171+
grouped: Tuple[List[QueryValue], ...] = ([], [])
172+
for thing in things:
173+
grouped[bool(thing.keyword)].append(thing)
174+
for group in grouped:
175+
yield from self._lines_keyword(keyword, group)
30176

31-
@builder
32-
def join(
33-
self, item: Table, how: JoinType = JoinType.inner
34-
) -> "JoinQuery":
35-
if isinstance(item, Table):
36-
return JoinQuery(self, item, how, label="table")
177+
def _lines_keyword(self, keyword: str, things: Sequence[QueryValue]) -> Iterable[str]:
178+
for i, thing in enumerate(things):
179+
last = i + 1 == len(things)
37180

38-
raise ValueError(f"Cannot join on type {type(item)}")
181+
if thing.keyword:
182+
yield thing.keyword + '\n'
39183

40-
def inner_join(self, item: Table) -> "JoinQuery":
41-
return self.join(item, JoinType.inner)
184+
_format = self.formats[bool(thing.alias)][keyword]
185+
value = thing.value
42186

43-
def left_join(self, item: Table) -> "JoinQuery":
44-
return self.join(item, JoinType.left)
187+
if thing.is_subquery:
188+
value = f'(\n{textwrap.indent(text=value, prefix=" ")}\n)'
189+
190+
yield textwrap.indent(text=_format.format(value=value, alias=thing.alias), prefix=' ')
191+
192+
if thing.on_condn:
193+
yield '\n ON '+thing.on_condn
194+
195+
if not last and not thing.keyword:
196+
try:
197+
yield ' ' + self.separators[keyword]
198+
except KeyError:
199+
yield self.default_separator
200+
201+
yield '\n'
45202

46203

47204
@dataclass()
48205
class RedshiftDialect:
49206
target_table_conf: List[Dict]
50207
joins_and_filters_conf: Dict[str, Dict]
208+
select_sources: List[Dict]
51209

52210
def get_sql(self) -> None:
53211
"""Method to trigger the Redshift query builder
54212
"""
55213
logger.info('building Redshift query from the mappings file')
56214

57-
_query = RedshiftQuery()
215+
_query = BaseQuery()
216+
_query = self.get_select(_query, self.select_sources)
217+
_query = self.get_join(_query)
218+
print(str(_query))
219+
220+
def get_select(self, _query: BaseQuery, select_sources: List[Dict]) -> BaseQuery:
221+
"""method to generate the select sql
222+
223+
Args:
224+
_query (BaseQuery)
225+
select_sources (List[Dict]): mapping file dictionary
58226
227+
Returns:
228+
BaseQuery
229+
"""
230+
try:
231+
for _select in select_sources:
232+
_query = _query.SELECT(
233+
(_select['column_alias'], _select['transformation']))
234+
235+
return _query
236+
except Exception as excep:
237+
logger.error("Error while generating SELECT sql")
238+
raise AutoETLException(
239+
"Error while generating SELECT sql", excep.args)
240+
241+
def get_join(self, _query: BaseQuery) -> BaseQuery:
242+
"""method to generate the join sql
243+
244+
Args:
245+
_query (BaseQuery)
246+
247+
Returns:
248+
BaseQuery
249+
"""
59250
for _index in self.joins_and_filters_conf:
60251

61252
_map = self.joins_and_filters_conf[_index]
62253

63254
if int(_index) == 0:
64-
schema1, schema2 = Schema(_map['driving_table'].split(
65-
'.')[0]), Schema(_map['reference_table'].split('.')[0])
66255

67-
table1, table2 = Table(_map['driving_table'].split('.')[1],
68-
schema=schema1, alias=_map['driving_table_alias']), \
69-
Table(_map['reference_table'].split('.')[1],
70-
schema=schema2, alias=_map['reference_table_alias'])
256+
_query = _query.FROM(
257+
(_map['driving_table_alias'], _map['driving_table']))
71258

72-
_query = _query.from_(table1).inner_join(
73-
table2).on(_map['join_condition'])
259+
if _map['reference_subquery']:
260+
ref_table = '('+_map['reference_subquery']+')'
74261
else:
75-
schema2 = Schema(_map['reference_table'].split(
76-
'.')[0])
262+
ref_table = _map['reference_table']
77263

78-
table2 = Table(_map['reference_table'].split('.')[1],
79-
schema=schema2, alias=_map['driving_table_alias'])
264+
match _map['join_type']:
265+
case "left join":
266+
_query = _query.LEFT_JOIN(
267+
(_map['reference_table_alias'], ref_table, _map['join_condition']))
268+
case "right join":
269+
_query = _query.RIGHT_JOIN(
270+
(_map['reference_table_alias'], ref_table, _map['join_condition']))
271+
case "inner join":
272+
_query = _query.INNER_JOIN(
273+
(_map['reference_table_alias'], ref_table, _map['join_condition']))
274+
case "full outer join":
275+
_query = _query.FULL_OUTER_JOIN(
276+
(_map['reference_table_alias'], ref_table, _map['join_condition']))
277+
case "cross join":
278+
_query = _query.CROSS_JOIN(
279+
(_map['reference_table_alias'], ref_table, _map['join_condition']))
80280

81-
_query = _query.left_join(table2).on(
82-
_map['join_condition'])
281+
if _map['filter_condition']:
282+
_query = _query.WHERE(_map['filter_condition'])
83283

84-
print(_query.select('*'))
284+
return _query

src/app/services/query_builder/query_builder.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import dataclasses
22
import logging
33
import os
4+
import sys
45

56
from pydantic.dataclasses import dataclass
67

@@ -47,11 +48,13 @@ def run(self) -> None:
4748
target_table_json = excel_to_json(meta_xls, 'target_table', 'records')
4849
joins_and_filters = excel_to_json(
4950
meta_xls, 'joins_and_filters', 'index')
51+
select_sources = excel_to_json(meta_xls, 'select_sources', 'records')
5052

5153
validate_joins_mapping(joins_and_filters)
5254
match _config['target']:
5355
case 'redshift':
54-
RedshiftDialect(target_table_json, joins_and_filters).get_sql()
56+
RedshiftDialect(target_table_json,
57+
joins_and_filters, select_sources).get_sql()
5558

5659
case _:
5760
logger.error(

0 commit comments

Comments
 (0)