Module csv_to_json
Build an object -> upstream dependencies JSON object from CSV content.
Parameters
- [
str
]: Path to the CSV file(s).
Returns
- [
str
]: JSON-formatted, nested list of object and upstream dependencies (objects that are depended on).
Usage
$ python script.py <CSV FILE> [<CSV FILE> [...]]
Example
$ python script.py dependencies.csv
$ python script.py file1.csv file2.csv file3.csv
Functions
Functions
csv_to_json.to_json
to_json(content: str, objects: dict[str, list[str]]) -> dict[str, list[str]]:
Convert the CSV content to JSON.
Parameters
content
[str
]: The CSV content to parse and convert.
objects
[dict[str, list[str]]
]: Dictionary of objects already parsed.
Returns
- [
dict[str, list[str]]
]: Updated dictionary of objects and upstream dependencies.
Notes
Expects a two columns dataset, each line embedding a object1,object2
pair; the second object is expected to be depended upon.
source
def to_json(content: str, objects: dict[str, list[str]]) -> dict[str, list[str]]:
    r"""Convert the CSV content to JSON.

    Parameters
    ----------
    content : str
        The CSV content to parse and convert.
    objects : dict[str, list[str]]
        Dictionary of objects already parsed; updated in place and returned.

    Returns
    -------
    : dict[str, list[str]]
        Updated dictionary of objects and upstream dependencies.

    Notes
    -----
    Expects a two columns dataset, each line embedding a `object1,object2` pair; the
    second object is expected to be depended upon. Whitespace around the object names
    is ignored, and both `\n` and `\r\n` line endings are supported.
    """
    # splitlines() handles CRLF content that split("\n") would leave "\r"-polluted
    for row in content.splitlines():
        if row.strip():
            # each non-empty row is a "child,parent" pair; strip stray spaces
            child, parent = (field.strip() for field in row.split(","))
            # list dependencies as parent -> (list of) child(ren)
            objects.setdefault(parent, []).append(child)
    return objects
Module filter_json
Fetch all objects related to another one, regardless of the depth.
Parameters
- [
str
]: Name of the object to filter for. - [
str
]: Path to the JSON file(s).
Returns
- [
str
]: JSON-formatted, nested list of object and upstream dependencies (objects that are depended on).
Usage
$ python script.py <OBJECT NAME> <JSON FILE> [<JSON FILE> [...]]
Example
$ python script.py fact_thing dependencies.json
$ python script.py dim_whatever file1.json file2.json file3.json
Functions
filter_json()
: Fetch all objects related to a single object, regardless of the depth.
Functions
filter_json.filter_json
filter_json(
name: str,
objects: dict[str, list[str]],
_objects: dict[str, list[str]] | None = None,
) -> dict[str, list[str]]:
Fetch all objects related to a single object, regardless of the depth.
Parameters
name
[str
]: Name of the object to filter for.objects
[dict[str, list[str]]
]: Dictionary of objects and upstream dependencies._objects
[dict[str, list[str]]
]: Dictionary of objects already parsed.
Returns
- [
dict[str, list[str]]
]: Filtered list of upstream and downstream dependencies.
source
def filter_json(
name: str,
objects: dict[str, list[str]],
_objects: dict[str, list[str]] | None = None,
) -> dict[str, list[str]]:
r"""Fetch all objects related to a single object, regardless of the depth.
Parameters
----------
name : str
Name of the object to filter for.
objects : dict[str, list[str]]
Dictionary of objects and upstream dependencies.
_objects : dict[str, list[str]]
Dictionary of objects already parsed.
Returns
-------
: dict[str, list[str]]
Filtered list of upstream and downstream dependencies.
"""
_objects = {} if _objects is None else _objects
included: list[str] = []
# filter until the size of the list does not change anymore
maxn = 1e99
while len(included) != maxn:
maxn = len(included)
# run through all the nodes
# expensive if a lot of objects are involved
for n, deps in objects.items():
if n == name or n in included:
for d in deps:
if d not in included:
included.append(d)
# all objects along the lineage
for i in included:
if i in objects and i not in _objects:
_objects[i] = objects[i]
return _objects
Convert a child -> list of parents JSON to Mermaid
syntax.
Parameters
- [
str
]: Path to the JSON file(s).
Returns
Usage
$ python script.py <JSON FILE> [<JSON FILE> [...]]
Example
$ python script.py dependencies.json
$ python script.py file1.json file2.json file3.json
Functions
to_dot()
: Convert the JSON content to DOT
syntax.to_mmd()
: Convert the JSON content to Mermaid
syntax.
Functions
to_dot(objects: dict[str, list[str]]) -> str:
Convert the JSON content to DOT
syntax.
Parameters
objects
[dict[str, list[str]]
]: Dictionary of objects and upstream dependencies.
Returns
source
def to_dot(objects: dict[str, list[str]]) -> str:
    r"""Convert the JSON content to `DOT` syntax.

    Parameters
    ----------
    objects : dict[str, list[str]]
        Dictionary of objects and upstream dependencies.

    Returns
    -------
    : str
        `DOT` diagram.
    """
    # build the list of unique nodes, numbered from 1
    nodes: dict[str, int] = {}
    i = 0
    for n1, deps in objects.items():
        if n1 not in nodes:
            i += 1
            nodes[n1] = i
        for n2 in deps:
            if n2 not in nodes:
                i += 1
                nodes[n2] = i
    d = ""
    # nodes; reuse the ids from the nodes map so declarations match the links
    # (DOT node attributes require the "label=" form)
    d += "  // nodes\n"
    for n, j in nodes.items():
        d += f'  node{j} [label="{n}"]\n'
    # links; one edge per upstream dependency -> dependent object
    d += "  // links\n"
    for n1, deps in objects.items():
        for n2 in deps:
            d += f"  node{nodes[n2]} -- node{nodes[n1]}\n"
    return f"graph {{\n{d}}}"
to_mmd(objects: dict[str, list[str]]) -> str:
Convert the JSON content to Mermaid
syntax.
Parameters
objects
[dict[str, list[str]]
]: Dictionary of objects and upstream dependencies.
Returns
source
def to_mmd(objects: dict[str, list[str]]) -> str:
    r"""Convert the JSON content to `Mermaid` syntax.

    Parameters
    ----------
    objects : dict[str, list[str]]
        Dictionary of objects and upstream dependencies.

    Returns
    -------
    : str
        `Mermaid` diagram.
    """
    # always top-bottom, manually change it if you want
    d = "graph TB\n"
    # build the list of unique nodes, numbered from 1
    nodes: dict[str, int] = {}
    i = 0
    for n1, deps in objects.items():
        if n1 not in nodes:
            i += 1
            nodes[n1] = i
        for n2 in deps:
            if n2 not in nodes:
                i += 1
                nodes[n2] = i
    # nodes; reuse the ids from the nodes map so declarations match the links
    d += "  %% nodes\n"
    for n, j in nodes.items():
        d += f"  node{j}({n})\n"
    # links; one edge per upstream dependency --- dependent object
    d += "  %% links\n"
    for n1, deps in objects.items():
        for n2 in deps:
            d += f"  node{nodes[n2]} --- node{nodes[n1]}\n"
    return d
Module sql_to_json
Extract upstream dependencies from SQL files, each containing a single query.
Parameters
- [
str
]: Path to the SQL script(s), each containing a singled out SQL query.
Returns
- [
str
]: JSON-formatted, nested list of objects and associated list of upstream dependencies. One can see this object as child -> list of parents.
Usage
$ python script.py <SQL FILE> [<SQL FILE> [...]]
$ python script.py <SQL FILE> [<SQL FILE> [...]] --pretty
Example
$ python script.py view.sql --pretty
$ python script.py fact_*.sql dim_*.sql
Note
- Only a few SQL statements amongst the gazillions ways to write them are supported; feel free to drop a message with a new one to test.
- This is based on queries running on
Redshift
, no guarantees this would work on any other syntax (but Redshift
is largely based on PostgreSQL
, there's hope). - This little stunt is still in alpha, and a lot more testing is required!
Functions
Functions
sql_to_json.clean_query
clean_query(query: str) -> str:
Deep-cleaning of a SQL query via sqlparse
and regular expressions.
Parameters
query
[str
]: The SQL query.
Returns
Notes
sqlparse
tries to set all supported SQL statements to uppercase.- Further cleaning is done via the following regular expressions:
"/\*.*\*/"
-> ""
: remove remaining multiline comments;"--.*"
-> ""
: remove remaining inline comments;"([(,)])"
-> " , "
: single spaces around function parameters;"([A-Za-z0-9_]+)\s*\.\s*([A-Za-z0-9_]+)"
-> "[...].[...]"
: remove spaces around object descriptors;"(.*)\s*[<=>]+\s*(.*)"
-> "[...] = [...]"
: single spaces around equal, greater or less than signs (or combinations thereof);"(.*)\s*\|\|\s*(.*)
-> "[...] || [...]"
: single spaces around concatenation operators;"(.*)\s*::\s*(.*)"
-> "[...]::[...]"
: remove spaces around datatyping operators;"[\s]+"
-> " "
: replace multiple spaces by single spaces;";$"
-> ""
: remove final semicolumn (;
).
source
def clean_query(query: str) -> str:
    r"""Deep-cleaning of a SQL query via
    [`sqlparse`](https://github.com/andialbrecht/sqlparse) and regular expressions.

    Parameters
    ----------
    query : str
        The SQL query.

    Returns
    -------
    : str
        Cleaned up query.

    Notes
    -----
    1. `sqlparse` tries to set all supported SQL statements to uppercase.
    2. Further cleaning is done via the following regular expressions:
        * `"/\*.*\*/"` -> `""`: remove remaining multiline comments;
        * `"--.*"` -> `""`: remove remaining inline comments;
        * `"([(,)])"` -> `" , "`: single spaces around function parameters;
        * `"([A-Za-z0-9_]+)\s*\.\s*([A-Za-z0-9_]+)"` -> `"[...].[...]"`: remove spaces
          around object descriptors;
        * `"(.*)\s*[<=>]+\s*(.*)"` -> `"[...] = [...]"`: single spaces around equal,
          greater or less than signs (or combinations thereof);
        * `"(.*)\s*\|\|\s*(.*)` -> `"[...] || [...]"`: single spaces around
          concatenation operators;
        * `"(.*)\s*::\s*(.*)"` -> `"[...]::[...]"`: remove spaces around datatyping
          operators;
        * `"[\s]+"` -> `" "`: replace multiple spaces by single spaces;
        * `";$"` -> `""`: remove final semicolon (`;`).
    """
    # good effort, but sqlparse does not know some functions/keywords
    cleaned = sqlparse.format(query, keyword_case="lower", strip_comments=True)
    # leftover multiline comments need DOTALL, hence kept out of the table below
    cleaned = re.sub(r"/\*.*\*/", "", cleaned, flags=re.DOTALL)
    # regular cleaning, applied strictly in order (see the notes above)
    substitutions = (
        (r"--.*", ""),
        (r"([(,)])", r" \1 "),
        (r"([A-Za-z0-9_]+)\s*\.\s*([A-Za-z0-9_]+)", r"\1.\2"),
        (r"(.*)\s*[<=>]+\s*(.*)", r"\1 = \2"),
        (r"(.*)\s*\|\|\s*(.*)", r"\1 || \2"),
        (r"(.*)\s*::\s*(.*)", r"\1::\2"),
        (r"[\s]+", " "),
        (r";$", ""),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
sql_to_json.clean_functions
clean_functions(query: str) -> str:
Escape FROM
operators in SQL functions.
Parameters
query
[str
]: The SQL query.
Returns
Notes
Currently testing for the following regular expression:
"(\(\s+['\"].+?['\"]\s+)FROM(\s+\S+?\s+\))"
: match function parameters including quoted keywords and the FROM
keyword,"(\(\s+\S+?\s+)FROM(\s+\S+?\s+\))"
: match function parameters including regular unquoted keywords and the FROM
keyword,"(\(\s+\S+?\s+)FROM(\s+\S+?\s+\()"
: ``.
The FROM
from the matched pattern will be replaced by %FROM%
not to be matched by the follow up processing.source
def clean_functions(query: str) -> str:
    r"""Escape `FROM` operators in SQL functions.

    Parameters
    ----------
    query : str
        The SQL query.

    Returns
    -------
    : str
        Cleaned up query.

    Notes
    -----
    Currently testing for the following regular expression:

    * `"(\(\s+['\"].+?['\"]\s+)FROM(\s+\S+?\s+\))"`: match function parameters including
      quoted keywords and the `FROM` keyword,
    * `"(\(\s+\S+?\s+)FROM(\s+\S+?\s+\))"`: match function parameters including regular
      unquoted keywords and the `FROM` keyword,
    * `"(\(\s+\S+?\s+)FROM(\s+\S+?\s+\()"`: same, with a reversed closing bracket.

    The `FROM` from the matched pattern will be replaced by `%FROM%` not to be matched
    by the follow up processing.
    """
    patterns = (
        r"(\(\s+['\"].+?['\"]\s+)from(\s+\S+?\s+\))",
        r"(\(\s+\S+?\s+)from(\s+\S+?\s+\))",
        r"(\(\s+\S+?\s+)from(\s+\S+?\s+\()",  # reversed bracket
    )
    # clean up the query until its length does not change anymore; kind of expects a
    # query that went through clean_query() first (no multiline text accounted for)
    previous = -1
    while len(query) != previous:
        previous = len(query)
        for pattern in patterns:
            for match in re.finditer(pattern, query):
                query = query.replace(
                    match.group(0), f"{match.group(1)}%FROM%{match.group(2)}"
                )
    return query
sql_to_json.split_query
split_query(query: str) -> dict[str, str]:
Split a query in its subqueries, if any.
Parameters
query
[str
]: The DDL to parse.
Returns
- [
dict[str, str]
]: Dictionary of [sub]queries and associated DDL, split in parts if the union
keyword is found.
Notes
Processing goes as follows:
- Search for
... as ( select ... )
CTE statement via the [^\s]+\s+AS\s+\(\s+SELECT
regular expression. - Read each character from there, keeping count of opening/closing brackets; once this number reaches zero (or we seeked to end of the query) we are done with the subquery.
- Store the subquery under the CTE name.
- Recursively search for new CTE statements within the subquery, if any.
- Move on to the next subquery.
- Extract the main query, if any, using the following regular expressions (these could be factored a bit further but clarity prevails):
CREATE\s+EXTERNAL\s+TABLE\s+([^\s]+)
CREATE\s+TABLE\s([^\s]+)
CREATE\s+MATERIALIZED\s+VIEW\s([^\s]+)
CREATE\+OR\s+REPLACE\s+VIEW\s([^\s]+)
CREATE\s+VIEW\s([^\s]+)
source
def split_query(query: str) -> dict[str, str]:
    r"""Split a query in its subqueries, if any.

    Parameters
    ----------
    query : str
        The DDL to parse.

    Returns
    -------
    : dict[str, str]
        Dictionary of [sub]queries and associated DDL, split in parts if the `union`
        keyword is found.

    Notes
    -----
    Processing goes as follows:

    1. Search for `... as ( select ... )` CTE statement via the
       `[^\s]+\s+AS\s+\(\s+SELECT` regular expression.
    2. Read each character from there, keeping count of opening/closing brackets; once
       this number reaches zero (or we seeked to end of the query) we are done with the
       subquery.
    3. Store the subquery under the CTE name.
    4. Recursively search for new CTE statements within the subquery, if any.
    5. Move on to the next subquery.
    6. Extract the main query, if any, using the following regular expressions (these
       could be factored a bit further but clarity prevails):

       * `CREATE\s+EXTERNAL\s+TABLE\s+([^\s]+)`
       * `CREATE\s+TABLE\s+([^\s]+)`
       * `CREATE\s+MATERIALIZED\s+VIEW\s+([^\s]+)`
       * `CREATE\s+OR\s+REPLACE\s+VIEW\s+([^\s]+)`
       * `CREATE\s+VIEW\s+([^\s]+)`
    """
    # recursively extract subqueries; make sure we start from empty parts
    query, parts = _split(query, {})
    maxn = len(parts)
    # extract main query, if any (if the query does not generate any object, this step
    # returns nothing); \s+ (not \s) between keyword and name tolerates any spacing
    for r in (
        r"create\s+external\s+table\s+([^\s]+)",
        r"create\s+table\s+([^\s]+)",
        r"create\s+materialized\s+view\s+([^\s]+)",
        r"create\s+or\s+replace\s+view\s+([^\s]+)",
        r"create\s+view\s+([^\s]+)",
    ):
        if (m := re.search(r, query, flags=re.IGNORECASE)) is not None:
            parts[m.group(1)] = re.sub(
                r"select\s+(.*?)\s+from", "select %COLUMNS% from", query
            )
            break
    # if no object was found, we still want to analyze the last statement
    if len(parts) == maxn:
        parts["SELECT"] = re.sub(
            r"select\s+(.*?)\s+from", "select %COLUMNS% from", query
        )
    # clean out unwanted objects (containing our "%SUBQUERY:" keyword for instance,
    # product of using extra brackets and the imperfect regular expressions above)
    for k in list(parts.keys()):
        if "%SUBQUERY:" in k:
            parts.pop(k)
    return parts
sql_to_json.fetch_dependencies
fetch_dependencies(parts: dict[str, str]) -> dict[str, list[str]]:
Fetch upstream dependencies from each subquery.
Parameters
parts
[dict[str, list[str]]
]: Dictionary of [sub]queries and associated DDL.
Returns
- [
dict[str, list[str]]
]: Dictionary of objects and associated list of upstream dependencies.
Notes
Supported regular expressions (e.g., SQL statements):
FROM\s+([^\s(]+)
JOIN\s+([^\s(]+)
LOCATION\s+'(s3://.+)'
(Redshift
stuff)
source
def fetch_dependencies(parts: dict[str, str]) -> dict[str, list[str]]:
    r"""Fetch upstream dependencies from each subquery.

    Parameters
    ----------
    parts : dict[str, list[str]]
        Dictionary of [sub]queries and associated DDL.

    Returns
    -------
    : dict[str, list[str]]
        Dictionary of objects and associated list of upstream dependencies.

    Notes
    -----
    Supported regular expressions (_e.g._, SQL statements):

    1. `FROM\s+([^\s(]+)`
    2. `JOIN\s+([^\s(]+)`
    3. `LOCATION\s+'(s3://.+)'` (`Redshift` stuff)
    """
    patterns = (
        r"\s+from\s+([^\s(]+)",
        r"\s+join\s+([^\s(]+)",
        r"\s+location\s+'(s3://.+)'",
    )
    tree: dict[str, list[str]] = {}
    # iterate over each object -> associated subqueries
    for name, ddl in parts.items():
        upstream: list[str] = []
        # cheap pre-check before running the regular expressions
        if any(f" {keyword} " in ddl.lower() for keyword in ("from", "join", "location")):
            for pattern in patterns:
                for match in re.finditer(pattern, ddl, flags=re.IGNORECASE):
                    if (dep := match.group(1)) not in upstream:
                        upstream.append(dep)
        # order the dependencies
        upstream.sort()
        tree[name] = upstream
    return tree
Module test_sql_to_json
Some test regarding our little SQL parsing.
Functions
Functions
test_sql_to_json.test_convoluted_query
test_convoluted_query() -> None:
Test a convoluted query, including subqueries and subsubqueries.
The following query contains:
- subqueries defined through
WITH
, - subqueries within subqueries (...),
- subqueries without
FROM
or JOIN
, - a function including the
FROM
keyword, FROM
, JOIN
, UNION
.
with
subquery1 as (
select
attr1,
attr2
from (
with
subsubquery1 as (
select
t1.attr1,
t2.attr2
from table1 t1
inner join table2 t2
on t1.attr = t2.attr
),
subsubquery2 as (
select
attr1,
attr2
from table3
)
select * from subsubquery1
union all
select * from subsubquery2
)
where attr1 <> 0
and attr2 is not null
),
subquery2 (
select
s1.*,
t4.*
from table4 t4
left outer join subquery1 s1
on t4.attr1 = s1.attr1 and t4.attr2 > 0
inner join table5 t5
on t4.attr1 = t5.attr1
where t5.valid_from is not null
),
subquery3 as (
select
trim('"' from attr1) as attr1,
'2' as attr2
)
select
s2.attr1,
s2.attr2
from subquery2 s2
cross join subquery3 s3;
Below the query diagram:
graph LR
%% nodes
node1(table1)
node2(subsubquery1)
node3(table2)
node4(table3)
node5(subsubquery2)
node6(subquery1)
node7(subquery2)
node8(table4)
node9(table5)
node10(SELECT)
node11(subquery3)
%% links
node1 --- node2
node3 --- node2
node4 --- node5
node2 --- node6
node5 --- node6
node6 --- node7
node8 --- node7
node9 --- node7
node7 --- node10
node11 --- node10
%% style
linkStyle default fill:none,stroke-width:1px
Note the final SELECT
statement is indicated as a node in itself, despite not being an object.source
def test_convoluted_query() -> None:
    """Test a convoluted query, including subqueries and subsubqueries.

    The following query contains:

    * subqueries defined through `WITH`,
    * subqueries within subqueries (...),
    * subqueries without `FROM` or `JOIN`,
    * a function including the `FROM` keyword,
    * `FROM`, `JOIN`, `UNION`.

    ```sql
    with
      subquery1 as (
        select
          attr1,
          attr2
        from (
          with
            subsubquery1 as (
              select
                t1.attr1,
                t2.attr2
              from table1 t1
              inner join table2 t2
              on t1.attr = t2.attr
            ),
            subsubquery2 as (
              select
                attr1,
                attr2
              from table3
            )
          select * from subsubquery1
          union all
          select * from subsubquery2
        )
        where attr1 <> 0
          and attr2 is not null
      ),
      subquery2 as (
        select
          s1.*,
          t4.*
        from table4 t4
        left outer join subquery1 s1
        on t4.attr1 = s1.attr1
        inner join table5 t5
        on t4.attr1 = t5.attr1
        where t5.valid_from is not null
      ),
      subquery3 as (
        select
          trim('"' from attr1) as attr1,
          '2' as attr2
      )
    select
      s2.attr1,
      s2.attr2
    from subquery2 s2
    cross join subquery3 s3;
    ```

    Below the query diagram:

    ```mermaid
    graph LR
      %% nodes
      node1(table1)
      node2(subsubquery1)
      node3(table2)
      node4(table3)
      node5(subsubquery2)
      node6(subquery1)
      node7(subquery2)
      node8(table4)
      node9(table5)
      node10(SELECT)
      node11(subquery3)
      %% links
      node1 --- node2
      node3 --- node2
      node4 --- node5
      node2 --- node6
      node5 --- node6
      node6 --- node7
      node8 --- node7
      node9 --- node7
      node7 --- node10
      node11 --- node10
      %% style
      linkStyle default fill:none,stroke-width:1px
    ```

    Note the final `SELECT` statement is indicated as a node in itself, despite not
    being an object.
    """
    rq = """
    with
      subquery1 as (
        select
          attr1,
          attr2
        from (
          with
            subsubquery1 as (
              select
                t1.attr1,
                t2.attr2
              from table1 t1
              inner join table2 t2
              on t1.attr = t2.attr
            ),
            subsubquery2 as (
              select
                attr1,
                attr2
              from table3
            )
          select * from subsubquery1
          union all
          select * from subsubquery2
        )
        where attr1 <> 0
          and attr2 is not null
      ),
      subquery2 as (
        select
          s1.*,
          t4.*
        from table4 t4
        left outer join subquery1 s1
        on t4.attr1 = s1.attr1
        inner join table5 t5
        on t4.attr1 = t5.attr1
        where t5.valid_from is not null
      ),
      subquery3 as (
        select
          trim('"' from attr1) as attr1,
          '2' as attr2
      )
    select
      s2.attr1,
      s2.attr2
    from subquery2 s2
    cross join subquery3 s3;
    """
    # compare the dependency tree extracted by the full pipeline
    q, s, d = _process(rq)
    assert d == {
        "subsubquery1": ["table1", "table2"],
        "subsubquery2": ["table3"],
        "subquery1": ["subsubquery1", "subsubquery2"],
        "subquery2": ["subquery1", "table4", "table5"],
        "subquery3": [],
        "SELECT": ["subquery2", "subquery3"],
    }
test_sql_to_json.test_create_external_table
test_create_external_table() -> None:
Test for the CREATE EXTERNAL TABLE
and LOCATION
statements.
create external table external_table (
attr1 timestamp,
attr2 varchar(32),
attr3 smallint
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://bucket/key/_symlink_format_manifest';
source
def test_create_external_table() -> None:
    """Check the `CREATE EXTERNAL TABLE` and `LOCATION` statements.

    ```sql
    create external table external_table (
        attr1 timestamp,
        attr2 varchar(32),
        attr3 smallint
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 's3://bucket/key/_symlink_format_manifest';
    ```
    """
    raw = """
    CREATE EXTERNAL TABLE external_table (
        attr1 timestamp,
        attr2 varchar(32),
        attr3 smallint
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION 's3://bucket/key/_symlink_format_manifest';
    """
    # the S3 location is expected to show up as the single upstream dependency
    _, _, deps = _process(raw)
    assert deps == {"external_table": ["s3://bucket/key/_symlink_format_manifest"]}
test_sql_to_json.test_create_materialized_view
test_create_materialized_view() -> None:
Test for CREATE MATERIALIZED VIEW
statement.
create materialized view materialized_view as (select * from external_table)
create materialized view materialized_view
backup no diststyle key distkey (attr) sortkey (attr1, attr2) as
select * from external_table;
source
def test_create_materialized_view() -> None:
    """Check the `CREATE MATERIALIZED VIEW` statement.

    ```sql
    create materialized view materialized_view as (select * from external_table)
    ```

    ```sql
    create materialized view materialized_view
    backup no diststyle key distkey (attr) sortkey (attr1, attr2) as
    select * from external_table;
    ```
    """
    queries = (
        "create materialized view materialized_view as (select * from external_table)",
        (
            "create materialized view materialized_view "
            "backup no diststyle key distkey (attr) sortkey (attr1, attr2) as "
            "select * from external_table;"
        ),
    )
    # both spellings should resolve to the same dependency tree
    for raw in queries:
        _, _, deps = _process(raw)
        assert deps == {"materialized_view": ["external_table"]}
test_sql_to_json.test_create_table
test_create_table() -> None:
Test for CREATE TABLE
statement.
create table table2 as select * from table1
source
def test_create_table() -> None:
    """Check the `CREATE TABLE` statement.

    ```sql
    create table table2 as select * from table1
    ```
    """
    _, subqueries, deps = _process("create table table2 as select * from table1")
    # the column list is masked, the dependency extracted
    assert subqueries == {
        "table2": "create table table2 as select %COLUMNS% from table1"
    }
    assert deps == {"table2": ["table1"]}
test_sql_to_json.test_create_view
test_create_view() -> None:
Test for CREATE [OR REPLACE] VIEW
statements.
create or replace view simple_view as select * from static_table
create view simple_view as select * from static_table
source
def test_create_view() -> None:
    """Check the `CREATE [OR REPLACE] VIEW` statements.

    ```sql
    create or replace view simple_view as select * from static_table
    ```

    ```sql
    create view simple_view as select * from static_table
    ```
    """
    queries = (
        "create or replace view simple_view as select * from static_table",
        "create view simple_view as select * from static_table",
    )
    # both spellings should resolve to the same dependency tree
    for raw in queries:
        _, _, deps = _process(raw)
        assert deps == {"simple_view": ["static_table"]}
test_sql_to_json.test_false_positive_from
test_false_positive_from() -> None:
Test the exclusion of ..._from
names or FUNCTION(... FROM ...)
statements.
select * from table t
right join valid_from vf
on t.attr = vf.attr and extract(month from vf.datetime) > 6
select
extract(year from datetime),
extract(month from to_timestamp(trim('"' from string), 'YYYY-MM-DD HH:MI:SS.FF'))
from table
source
def test_false_positive_from() -> None:
    """Check the exclusion of `..._from` names or `FUNCTION(... FROM ...)` statements.

    ```sql
    select * from table t
    right join valid_from vf
    on t.attr = vf.attr and extract(month from vf.datetime) > 6
    ```

    ```sql
    select
      extract(year from datetime),
      extract(month from to_timestamp(trim('"' from string), 'YYYY-MM-DD HH:MI:SS.FF'))
    from table
    ```
    """
    # join on a "..._from" column name; must not be mistaken for a FROM clause
    raw = """
    select * from table t
    right join valid_from vf
    on t.attr = vf.attr and extract(month from vf.datetime) > 6
    """
    _, _, deps = _process(raw)
    assert deps == {"SELECT": ["table", "valid_from"]}
    # functions embedding the FROM keyword, including nested ones
    raw = """
    select
      extract(year from datetime),
      extract(month from to_timestamp(trim('"' from string), 'YYYY-MM-DD HH:MI:SS.FF'))
    from table
    """
    _, subqueries, deps = _process(raw)
    assert subqueries == {"SELECT": "select %COLUMNS% from table"}
    assert deps == {"SELECT": ["table"]}
test_sql_to_json.test_subqueries
test_subqueries() -> None:
Test for subqueries (CTE), e.g., statement including a WITH
clause.
with
subquery1 as (
select
t1.attr1,
t2.attr2
from table1 t1
inner join table2 t2
on t1.attr = t2.attr
),
subquery2 as (
select
attr1,
attr2
from table3
)
select * from subquery1 s1
left join (select * from subquery2) s2
on s1.attr1 = s2.attr1
with
subquery1 as (
select extract(day from attr) as day
from table1
),
subquery2 as (
select attr
from table2 t2
left join table3
on t2.attr = t3.attr
),
subquery3 as (
select attr
from table4
)
select distinct
s1.attr,
s2.attr,
s3.attr
from subquery1 s1
inner join subquery2 s2
on s1.attr = s2.attr
right join subquery3 s3
on s2.attr = s3.attr
source
def test_subqueries() -> None:
    """Check subqueries (CTE), _e.g._, statements including a `WITH` clause.

    ```sql
    with
      subquery1 as (
        select
          t1.attr1,
          t2.attr2
        from table1 t1
        inner join table2 t2
        on t1.attr = t2.attr
      ),
      subquery2 as (
        select
          attr1,
          attr2
        from table3
      )
    select * from subquery1 s1
    left join (select * from subquery2) s2
    on s1.attr1 = s2.attr1
    ```

    ```sql
    with
      subquery1 as (
        select extract(day from attr) as day
        from table1
      ),
      subquery2 as (
        select attr
        from table2 t2
        left join table3
        on t2.attr = t3.attr
      ),
      subquery3 as (
        select attr
        from table4
      )
    select distinct
      s1.attr,
      s2.attr,
      s3.attr
    from subquery1 s1
    inner join subquery2 s2
    on s1.attr = s2.attr
    right join subquery3 s3
    on s2.attr = s3.attr
    ```
    """
    cases = (
        (
            """
            with
              subquery1 as (
                select
                  t1.attr1,
                  t2.attr2
                from table1 t1
                inner join table2 t2
                on t1.attr = t2.attr
              ),
              subquery2 as (
                select
                  attr1,
                  attr2
                from table3
              )
            select * from subquery1 s1
            left join (select * from subquery2) s2
            on s1.attr1 = s2.attr1
            """,
            {
                "subquery1": ["table1", "table2"],
                "subquery2": ["table3"],
                "SELECT": ["subquery1", "subquery2"],
            },
        ),
        (
            """
            with
              subquery1 as (
                select extract(day from attr) as day
                from table1
              ),
              subquery2 as (
                select attr
                from table2 t2
                left join table3
                on t2.attr = t3.attr
              ),
              subquery3 as (
                select attr
                from table4
              )
            select distinct
              s1.attr,
              s2.attr,
              s3.attr
            from subquery1 s1
            inner join subquery2 s2
            on s1.attr = s2.attr
            right join subquery3 s3
            on s2.attr = s3.attr
            """,
            {
                "subquery1": ["table1"],
                "subquery2": ["table2", "table3"],
                "subquery3": ["table4"],
                "SELECT": ["subquery1", "subquery2", "subquery3"],
            },
        ),
    )
    # each CTE should be listed with its own upstream dependencies
    for raw, expected in cases:
        _, _, deps = _process(raw)
        assert deps == expected