Module test_parser
Assert capabilities of the schema parser.
Functions
Functions
test_parser.test_datatype
test_datatype(text: str, struct: pl.Struct) -> None:
Test all supported standalone non-nesting datatypes and associated shorthands.
Parameters
text [str]: Schema in plain text.
struct [polars.Struct]: Expected datatype.
Decoration via @pytest.mark.parametrize().source
def test_datatype(text: str, struct: pl.Struct) -> None:
    """Check standalone non-nesting datatypes and their shorthands.

    Parameters
    ----------
    text : str
        Schema in plain text.
    struct : polars.Struct
        Expected datatype.
    """
    parsed = SchemaParser(text).to_struct()
    assert parsed == struct
test_parser.test_datatype_nested
test_datatype_nested(text: str, struct: pl.Struct) -> None:
Test nesting datatypes.
Parameters
text [str]: Schema in plain text.
struct [polars.Struct]: Expected datatype.
Decoration via @pytest.mark.parametrize().source
def test_datatype_nested(text: str, struct: pl.Struct) -> None:
    """Check that nesting datatypes are parsed into the expected structure.

    Parameters
    ----------
    text : str
        Schema in plain text.
    struct : polars.Struct
        Expected datatype.
    """
    parsed = SchemaParser(text).to_struct()
    assert parsed == struct
test_parser.test_delimiter
test_delimiter(text: str, struct: pl.Struct) -> None:
Test nested structure delimiters: (), [], {} or <>.
Parameters
text [str]: Schema in plain text.
struct [polars.Struct]: Expected datatype.
Decoration via @pytest.mark.parametrize().source
def test_delimiter(text: str, struct: pl.Struct) -> None:
    """Check the nested structure delimiters: `()`, `[]`, `{}` or `<>`.

    Parameters
    ----------
    text : str
        Schema in plain text.
    struct : polars.Struct
        Expected datatype.
    """
    parsed = SchemaParser(text).to_struct()
    assert parsed == struct
test_parser.test_list_nested_in_list
test_list_nested_in_list() -> None:
Test the parsing of a polars.List within a polars.List.
Test the generation of the following schema:
List(List(Int8))
source
def test_list_nested_in_list() -> None:
    """Check the parsing of a `polars.List` placed within a `polars.List`.

    The schema under test is:

    ```
    List(List(Int8))
    ```
    """
    expected = pl.Struct([pl.List(pl.List(pl.Int8))])
    parsed = SchemaParser("List(List(Int8))").to_struct()
    assert parsed == expected
test_parser.test_list_nested_in_struct
test_list_nested_in_struct() -> None:
Test the parsing of a polars.List within a polars.Struct.
Test the generation of the following schema:
Struct(
foo: List(Int8)
)
source
def test_list_nested_in_struct() -> None:
    """Check the parsing of a `polars.List` placed within a `polars.Struct`.

    The schema under test is:

    ```
    Struct(
        foo: List(Int8)
    )
    ```
    """
    # The top-level structure is wrapped in a field with an empty name.
    inner = pl.Struct([pl.Field("foo", pl.List(pl.Int8))])
    expected = pl.Struct([pl.Field("", inner)])
    parsed = SchemaParser("Struct(foo: List(Int8))").to_struct()
    assert parsed == expected
test_parser.test_pretty_printing
test_pretty_printing() -> None:
Test whether an inferred schema is correctly printed.source
def test_pretty_printing() -> None:
    """Check that an inferred schema is printed as stored in the sample file."""
    reference = pathlib.Path("tests/samples/nested-list.schema")
    # Surrounding whitespace in the reference file is ignored.
    expected = reference.read_text().strip()
    assert infer_schema("tests/samples/nested-list.ndjson") == expected
test_parser.test_real_life
test_real_life() -> None:
Test complex schema.
Test the following nested JSON content:
{
"headers": {
"timestamp": 1372182309,
"source": "Online.Transactions",
"offset": 123456789
},
"payload": {
"transaction": "inbound",
"location": 765,
"customer": {
"type": "REGISTERED",
"identifier": "a8098c1a-f86e-11da-bd1a-00112444be1e"
},
"lines": [
{
"product": 76543,
"description": "Toilet plunger",
"quantity": 2,
"vatRate": 0.21,
"amount": {
"includingVat": 10.0,
"excludingVat": 8.26,
"vat": 1.74,
"currency": "EUR"
},
"discounts": [
{
"promotion": 100023456000789,
"description": "Buy one get two",
"amount": {
"includingVat": 10.0,
"excludingVat": 8.26,
"vat": 1.74,
"currency": "EUR"
}
}
]
},
{
"product": 3456,
"description": "Toilet cap",
"quantity": 1,
"vatRate": 0.21,
"amount": {
"includingVat": 30.0,
"excludingVat": 24.79,
"vat": 5.21,
"currency": "EUR"
}
}
],
"payment": {
"method": "Card",
"company": "OnlineBanking",
"identifier": 123456789,
"amount": {
"includingVat": 40.0,
"excludingVat": 33.05,
"vat": 6.95,
"currency": "EUR"
}
}
}
}
as described by the following schema:
headers: Struct<
timestamp: Int64
source: String
offset: Int64
>
payload: Struct<
transaction=transaction_type: String
location: Int64
customer: Struct{
type=customer_type: String
identifier=customer_identifier: String
}
lines: List[
Struct{
product: Int64
description=product_description: String
quantity: Int64
vatRate=vat_rate: Float64
amount: Struct(
includingVat=line_amount_including_vat: Float64
excludingVat=line_amount_excluding_vat: Float64
vat=line_amount_vat: Float64
currency=line_amount_currency: String
)
discounts: List[
Struct{
promotion: Int64
description=promotion_description: String
amount: Struct{
includingVat=discount_amount_including_vat: Float64
excludingVat=discount_amount_excluding_vat: Float64
vat=discount_amount_vat: Float64
currency=discount_amount_currency: String
}
}
]
}
]
payment: Struct{
method: String
company: String
identifier=transaction_identifier: Int64
amount: Struct{
includingVat=total_amount_including_vat: Float64
excludingVat=total_amount_excluding_vat: Float64
vat=total_amount_vat: Float64
currency=total_amount_currency: String
}
}
>
source
def test_real_life() -> None:
    """Test complex schema.

    The schema parsed from the sample file is compared with the schema `Polars`
    infers when scanning the associated NDJSON sample.

    Test the following nested JSON content:

    ```json
    {
        "headers": {
            "timestamp": 1372182309,
            "source": "Online.Transactions",
            "offset": 123456789
        },
        "payload": {
            "transaction": "inbound",
            "location": 765,
            "customer": {
                "type": "REGISTERED",
                "identifier": "a8098c1a-f86e-11da-bd1a-00112444be1e"
            },
            "lines": [
                {
                    "product": 76543,
                    "description": "Toilet plunger",
                    "quantity": 2,
                    "vatRate": 0.21,
                    "amount": {
                        "includingVat": 10.0,
                        "excludingVat": 8.26,
                        "vat": 1.74,
                        "currency": "EUR"
                    },
                    "discounts": [
                        {
                            "promotion": 100023456000789,
                            "description": "Buy one get two",
                            "amount": {
                                "includingVat": 10.0,
                                "excludingVat": 8.26,
                                "vat": 1.74,
                                "currency": "EUR"
                            }
                        }
                    ]
                },
                {
                    "product": 3456,
                    "description": "Toilet cap",
                    "quantity": 1,
                    "vatRate": 0.21,
                    "amount": {
                        "includingVat": 30.0,
                        "excludingVat": 24.79,
                        "vat": 5.21,
                        "currency": "EUR"
                    }
                }
            ],
            "payment": {
                "method": "Card",
                "company": "OnlineBanking",
                "identifier": 123456789,
                "amount": {
                    "includingVat": 40.0,
                    "excludingVat": 33.05,
                    "vat": 6.95,
                    "currency": "EUR"
                }
            }
        }
    }
    ```

    as described by the following schema:

    ```
    headers: Struct<
        timestamp: Int64
        source: String
        offset: Int64
    >
    payload: Struct<
        transaction=transaction_type: String
        location: Int64
        customer: Struct{
            type=customer_type: String
            identifier=customer_identifier: String
        }
        lines: List[
            Struct{
                product: Int64
                description=product_description: String
                quantity: Int64
                vatRate=vat_rate: Float64
                amount: Struct(
                    includingVat=line_amount_including_vat: Float64
                    excludingVat=line_amount_excluding_vat: Float64
                    vat=line_amount_vat: Float64
                    currency=line_amount_currency: String
                )
                discounts: List[
                    Struct{
                        promotion: Int64
                        description=promotion_description: String
                        amount: Struct{
                            includingVat=discount_amount_including_vat: Float64
                            excludingVat=discount_amount_excluding_vat: Float64
                            vat=discount_amount_vat: Float64
                            currency=discount_amount_currency: String
                        }
                    }
                ]
            }
        ]
        payment: Struct{
            method: String
            company: String
            identifier=transaction_identifier: Int64
            amount: Struct{
                includingVat=total_amount_including_vat: Float64
                excludingVat=total_amount_excluding_vat: Float64
                vat=total_amount_vat: Float64
                currency=total_amount_currency: String
            }
        }
    >
    ```
    """
    dtype = parse_schema("tests/samples/complex.schema").struct
    df = pl.scan_ndjson("tests/samples/complex.ndjson").collect()
    # The parsed schema has to match the inferred one exactly, datatypes included.
    assert dtype.to_schema() == df.schema
test_parser.test_struct_nested_in_list
test_struct_nested_in_list() -> None:
Test the parsing of a polars.Struct within a polars.List.
Test the generation of the following schema:
List(
Struct(
foo: Int8,
bar: Int8
)
)
Notes
It seems Polars only accepts input starting with {, but not [ (such as a JSON list); although the schema described above is valid in a JSON sense, the associated data will not be ingested by Polars.
source
def test_struct_nested_in_list() -> None:
    """Check the parsing of a `polars.Struct` placed within a `polars.List`.

    The schema under test is:

    ```
    List(
        Struct(
            foo: Int8,
            bar: Int8
        )
    )
    ```

    Notes
    -----
    It seems `Polars` only accepts input starting with `{`, but not `[` (such as a
    JSON list); although the schema described above is valid in a JSON sense, the
    associated data will not be ingested by `Polars`.
    """
    element = pl.Struct([pl.Field("foo", pl.Int8), pl.Field("bar", pl.Int8)])
    expected = pl.Struct([pl.List(element)])
    parsed = SchemaParser("List(Struct(foo: Int8, bar: Int8))").to_struct()
    assert parsed == expected
test_parser.test_struct_nested_in_struct
test_struct_nested_in_struct() -> None:
Test the parsing of a polars.Struct within a polars.Struct.
Test the generation of the following schema:
Struct(
foo: Struct(
bar: Int8
)
)
source
def test_struct_nested_in_struct() -> None:
    """Check the parsing of a `polars.Struct` placed within a `polars.Struct`.

    The schema under test is:

    ```
    Struct(
        foo: Struct(
            bar: Int8
        )
    )
    ```
    """
    innermost = pl.Struct([pl.Field("bar", pl.Int8)])
    outer = pl.Struct([pl.Field("foo", innermost)])
    # The top-level structure is wrapped in a field with an empty name.
    expected = pl.Struct([pl.Field("", outer)])
    parsed = SchemaParser("Struct(foo: Struct(bar: Int8))").to_struct()
    assert parsed == expected
test_parser.test_unexpected_duplication
test_unexpected_duplication() -> None:
Test for duplicated column name (including after column renaming).source
def test_unexpected_duplication() -> None:
    """Check that duplicated column names are rejected, renamed columns included."""
    duplicated = (
        "Struct(foo: Int8, foo: Float32)",
        "Struct(foo: Int8, bar=foo: Float32)",
    )
    for text in duplicated:
        with pytest.raises(DuplicateColumnError):
            SchemaParser(text).to_struct()
test_parser.test_unexpected_renaming
test_unexpected_renaming() -> None:
Test for JSON path renaming (unsupported, and quite useless as well).source
def test_unexpected_renaming() -> None:
    """Check that renaming a JSON path is rejected (unsupported, and of no use)."""
    text = "this=that:Struct(foo:Int8)"
    with pytest.raises(PathRenamingError):
        SchemaParser(text).to_struct()
test_parser.test_unexpected_syntax
test_unexpected_syntax() -> None:
Test for failure to parse the schema due to unknown/unexpected syntax.source
def test_unexpected_syntax() -> None:
    """Check that unknown or unexpected syntax makes the schema parsing fail."""
    for text in ("!@#$%^&*", "Struct(!@#$%^&*)"):
        with pytest.raises(SchemaParsingError):
            SchemaParser(text).to_struct()
test_parser.test_unknown_datatype
test_unknown_datatype() -> None:
Test for unknown datatype.source
def test_unknown_datatype() -> None:
    """Check that an unknown datatype is rejected, standalone or nested."""
    unknown = (
        "Foo",
        "Struct(foo: Bar)",
        "Struct(foo=fox: Bar)",
    )
    for text in unknown:
        with pytest.raises(UnknownDataTypeError):
            SchemaParser(text).to_struct()
Module test_unpacker
Assert capabilities of the DataFrame / LazyFrame flattener.
Functions
Functions
test_unpacker.test_datatype
test_datatype() -> None:
Test a standalone datatype.
Test the following JSON content:
1
as described by the following schema:
Int64
source
def test_datatype() -> None:
    """Check the unpacking of a standalone datatype.

    Test the following JSON content:

    ```json
    1
    ```

    as described by the following schema:

    ```
    Int64
    ```
    """
    dtype = pl.Struct([pl.Field("", pl.Int64)])
    df = pl.DataFrame([0, 1, 2, 3], dtype)

    # Already covered by the parser tests, checked here again for good measure.
    parsed = SchemaParser("Int64").to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    # The test expects unpacking to leave this single-column frame unchanged.
    unpacked = df.json.unpack(dtype)
    assert unpacked.equals(df)
test_unpacker.test_list
test_list() -> None:
Test a simple polars.List containing a standalone datatype.
Test the following nested JSON content:
{
"text": "foobar",
"json": [
0,
1,
2,
3
]
}
as described by the following schema:
text: String,
json: List(Int64)
source
def test_list() -> None:
    """Check the unpacking of a `polars.List` holding a standalone datatype.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": [
            0,
            1,
            2,
            3
        ]
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: List(Int64)
    ```
    """
    fields = [
        pl.Field("text", pl.String),
        pl.Field("json", pl.List(pl.Int64)),
    ]
    dtype = pl.Struct(fields)
    content = {
        "text": "foobar",
        "json": json.loads("[[0, 1, 2, 3]]"),
    }
    df = pl.DataFrame(content, dtype)

    parsed = SchemaParser("text:String,json:List(Int64)").to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    # Unpacking the list is expected to be equivalent to exploding it.
    unpacked = df.json.unpack(dtype)
    assert unpacked.equals(df.explode("json"))
test_unpacker.test_list_nested_in_list_nested_in_list
test_list_nested_in_list_nested_in_list() -> None:
Test a polars.List nested in parent polars.Lists.
Test the following nested JSON content:
{
"text": "foobar",
"json": [
[
[
[10, 12],
[11, 13]
],
[
[30, 32],
[31, 33]
]
],
[
[
[20, 22],
[21, 23]
],
[
[40, 42],
[41, 43]
]
]
]
}
as described by the following schema:
text: String,
json: List(List(List(Int64)))
source
def test_list_nested_in_list_nested_in_list() -> None:
    """Check the unpacking of a `polars.List` nested in parent `polars.List`s.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": [
            [
                [
                    [10, 12],
                    [11, 13]
                ],
                [
                    [30, 32],
                    [31, 33]
                ]
            ],
            [
                [
                    [20, 22],
                    [21, 23]
                ],
                [
                    [40, 42],
                    [41, 43]
                ]
            ]
        ]
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: List(List(List(Int64)))
    ```
    """
    nested = pl.List(pl.List(pl.List(pl.Int64)))
    dtype = pl.Struct([pl.Field("text", pl.String), pl.Field("json", nested)])
    values = json.loads(
        "[[[[10, 12], [11, 13]], [[30, 32], [31, 33]]],"
        " [[[20, 22], [21, 23]], [[40, 42], [41, 43]]]]",
    )
    df = pl.DataFrame({"text": "foobar", "json": values}, dtype)

    parsed = SchemaParser("text:String,json:List(List(List(Int64)))").to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    # The unpacked column carries the full path as its name; rename it before
    # comparing with the manually exploded frame.
    unpacked = df.json.unpack(dtype).rename({"json.json.json.json": "json"})
    exploded = df.explode("json").explode("json").explode("json")
    assert unpacked.equals(exploded)
test_unpacker.test_list_nested_in_struct
test_list_nested_in_struct() -> None:
Test a polars.List nested in a polars.Struct.
Test the following nested JSON content:
{
"text": "foobar",
"json": {
"foo": {
"fox": 0,
"foz": 2
},
"bar": [
1,
3
]
}
}
as described by the following schema:
text: String,
json: Struct(
foo: Struct(
fox: Int64,
foz: Int64
),
bar: List(Int64)
)
source
def test_list_nested_in_struct() -> None:
    """Check the unpacking of a `polars.List` nested in a `polars.Struct`.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": {
            "foo": {
                "fox": 0,
                "foz": 2
            },
            "bar": [
                1,
                3
            ]
        }
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: Struct(
        foo: Struct(
            fox: Int64,
            foz: Int64
        ),
        bar: List(Int64)
    )
    ```
    """
    foo = pl.Struct([pl.Field("fox", pl.Int64), pl.Field("foz", pl.Int64)])
    payload = pl.Struct([pl.Field("foo", foo), pl.Field("bar", pl.List(pl.Int64))])
    dtype = pl.Struct([pl.Field("text", pl.String), pl.Field("json", payload)])

    record = json.loads('{"foo": {"fox": 0, "foz": 2}, "bar": [1, 3]}')
    df = pl.DataFrame({"text": ["foobar"], "json": [record]}, dtype)

    parsed = SchemaParser(
        "text:String,json:Struct(foo:Struct(fox:Int64,foz:Int64),bar:List(Int64))",
    ).to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    unpacked = df.json.unpack(dtype)
    # Manual equivalent: unnest both structures, explode the list, and name the
    # columns after their full path.
    paths = {"fox": "json.foo.fox", "foz": "json.foo.foz", "bar": "json.bar"}
    expected = df.unnest("json").unnest("foo").explode("bar").rename(paths)
    assert unpacked.equals(expected)
test_unpacker.test_real_life
test_real_life(df: pl.DataFrame) -> None:
Test complex real life-like parsing and flattening.
Test the following nested JSON content:
{
"headers": {
"timestamp": 1372182309,
"source": "Online.Transactions",
"offset": 123456789
},
"payload": {
"transaction": "inbound",
"location": 765,
"customer": {
"type": "REGISTERED",
"identifier": "a8098c1a-f86e-11da-bd1a-00112444be1e"
},
"lines": [
{
"product": 76543,
"description": "Toilet plunger",
"quantity": 2,
"vatRate": 0.21,
"amount": {
"includingVat": 10.0,
"excludingVat": 8.26,
"vat": 1.74,
"currency": "EUR"
},
"discounts": [
{
"promotion": 100023456000789,
"description": "Buy one get two",
"amount": {
"includingVat": 10.0,
"excludingVat": 8.26,
"vat": 1.74,
"currency": "EUR"
}
}
]
},
{
"product": 3456,
"description": "Toilet cap",
"quantity": 1,
"vatRate": 0.21,
"amount": {
"includingVat": 30.0,
"excludingVat": 24.79,
"vat": 5.21,
"currency": "EUR"
}
}
],
"payment": {
"method": "Card",
"company": "OnlineBanking",
"identifier": 123456789,
"amount": {
"includingVat": 40.0,
"excludingVat": 33.05,
"vat": 6.95,
"currency": "EUR"
}
}
}
}
as described by the following schema:
headers: Struct<
timestamp: Int64
source: String
offset: Int64
>
payload: Struct<
transaction=transaction_type: String
location: Int64
customer: Struct{
type=customer_type: String
identifier=customer_identifier: String
}
lines: List[
Struct{
product: Int64
description=product_description: String
quantity: Int64
vatRate=vat_rate: Float64
amount: Struct(
includingVat=line_amount_including_vat: Float64
excludingVat=line_amount_excluding_vat: Float64
vat=line_amount_vat: Float64
currency=line_amount_currency: String
)
discounts: List[
Struct{
promotion: Int64
description=promotion_description: String
amount: Struct{
includingVat=discount_amount_including_vat: Float64
excludingVat=discount_amount_excluding_vat: Float64
vat=discount_amount_vat: Float64
currency=discount_amount_currency: String
}
}
]
}
]
payment: Struct{
method: String
company: String
identifier=transaction_identifier: Int64
amount: Struct{
includingVat=total_amount_including_vat: Float64
excludingVat=total_amount_excluding_vat: Float64
vat=total_amount_vat: Float64
currency=total_amount_currency: String
}
}
>
Parameters
df [polars.DataFrame]: Unpacked Polars DataFrame.
Decoration via @pytest.mark.parametrize().source
def test_real_life(df: pl.DataFrame) -> None:
    """Test complex real life-like parsing and flattening.

    The unpacked `DataFrame` is compared, datatypes and content, with the
    flattened CSV sample.

    Test the following nested JSON content:

    ```json
    {
        "headers": {
            "timestamp": 1372182309,
            "source": "Online.Transactions",
            "offset": 123456789
        },
        "payload": {
            "transaction": "inbound",
            "location": 765,
            "customer": {
                "type": "REGISTERED",
                "identifier": "a8098c1a-f86e-11da-bd1a-00112444be1e"
            },
            "lines": [
                {
                    "product": 76543,
                    "description": "Toilet plunger",
                    "quantity": 2,
                    "vatRate": 0.21,
                    "amount": {
                        "includingVat": 10.0,
                        "excludingVat": 8.26,
                        "vat": 1.74,
                        "currency": "EUR"
                    },
                    "discounts": [
                        {
                            "promotion": 100023456000789,
                            "description": "Buy one get two",
                            "amount": {
                                "includingVat": 10.0,
                                "excludingVat": 8.26,
                                "vat": 1.74,
                                "currency": "EUR"
                            }
                        }
                    ]
                },
                {
                    "product": 3456,
                    "description": "Toilet cap",
                    "quantity": 1,
                    "vatRate": 0.21,
                    "amount": {
                        "includingVat": 30.0,
                        "excludingVat": 24.79,
                        "vat": 5.21,
                        "currency": "EUR"
                    }
                }
            ],
            "payment": {
                "method": "Card",
                "company": "OnlineBanking",
                "identifier": 123456789,
                "amount": {
                    "includingVat": 40.0,
                    "excludingVat": 33.05,
                    "vat": 6.95,
                    "currency": "EUR"
                }
            }
        }
    }
    ```

    as described by the following schema:

    ```
    headers: Struct<
        timestamp: Int64
        source: String
        offset: Int64
    >
    payload: Struct<
        transaction=transaction_type: String
        location: Int64
        customer: Struct{
            type=customer_type: String
            identifier=customer_identifier: String
        }
        lines: List[
            Struct{
                product: Int64
                description=product_description: String
                quantity: Int64
                vatRate=vat_rate: Float64
                amount: Struct(
                    includingVat=line_amount_including_vat: Float64
                    excludingVat=line_amount_excluding_vat: Float64
                    vat=line_amount_vat: Float64
                    currency=line_amount_currency: String
                )
                discounts: List[
                    Struct{
                        promotion: Int64
                        description=promotion_description: String
                        amount: Struct{
                            includingVat=discount_amount_including_vat: Float64
                            excludingVat=discount_amount_excluding_vat: Float64
                            vat=discount_amount_vat: Float64
                            currency=discount_amount_currency: String
                        }
                    }
                ]
            }
        ]
        payment: Struct{
            method: String
            company: String
            identifier=transaction_identifier: Int64
            amount: Struct{
                includingVat=total_amount_including_vat: Float64
                excludingVat=total_amount_excluding_vat: Float64
                vat=total_amount_vat: Float64
                currency=total_amount_currency: String
            }
        }
    >
    ```

    Parameters
    ----------
    df : polars.DataFrame
        Unpacked `Polars` `DataFrame`.
    """
    # Datatypes are forced for every column of the CSV sample rather than left to
    # inference, so that the comparison with the unpacked frame is exact.
    df_csv = pl.scan_csv(
        "tests/samples/complex.csv",
        schema_overrides={
            "timestamp": pl.Int64,
            "source": pl.String,
            "offset": pl.Int64,
            "transaction_type": pl.String,
            "location": pl.Int64,
            "customer_type": pl.String,
            "customer_identifier": pl.String,
            "product": pl.Int64,
            "product_description": pl.String,
            "quantity": pl.Int64,
            "vat_rate": pl.Float64,
            "line_amount_including_vat": pl.Float64,
            "line_amount_excluding_vat": pl.Float64,
            "line_amount_vat": pl.Float64,
            "line_amount_currency": pl.String,
            "promotion": pl.Int64,
            "promotion_description": pl.String,
            "discount_amount_including_vat": pl.Float64,
            "discount_amount_excluding_vat": pl.Float64,
            "discount_amount_vat": pl.Float64,
            "discount_amount_currency": pl.String,
            "method": pl.String,
            "company": pl.String,
            "transaction_identifier": pl.Int64,
            "total_amount_including_vat": pl.Float64,
            "total_amount_excluding_vat": pl.Float64,
            "total_amount_vat": pl.Float64,
            "total_amount_currency": pl.String,
        },
    ).collect()
    assert df.dtypes == df_csv.dtypes
    assert df.equals(df_csv)
test_unpacker.test_rename_fields
test_rename_fields() -> None:
Test for polars.Struct field renaming according to provided schema.
Test the following nested JSON content:
{
"text": "foobar",
"json": {
"foo": 0,
"bar": 1
}
}
as described by the following schema:
text=string: String,
json: Struct(
foo=fox: Int64,
bar=bax: Int64
)
which should return:
┌────────┬─────┬─────┐
│ string ┆ fox ┆ bax │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞════════╪═════╪═════╡
│ foobar ┆ 0 ┆ 1 │
└────────┴─────┴─────┘
source
def test_rename_fields() -> None:
    """Check the renaming of `polars.Struct` fields according to the schema.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": {
            "foo": 0,
            "bar": 1
        }
    }
    ```

    as described by the following schema:

    ```
    text=string: String,
    json: Struct(
        foo=fox: Int64,
        bar=bax: Int64
    )
    ```

    which should return:

    ```
    ┌────────┬─────┬─────┐
    │ string ┆ fox ┆ bax │
    │ ---    ┆ --- ┆ --- │
    │ str    ┆ i64 ┆ i64 │
    ╞════════╪═════╪═════╡
    │ foobar ┆ 0   ┆ 1   │
    └────────┴─────┴─────┘
    ```
    """
    # Parse the schema; the returned structure itself is not needed here.
    # NOTE(review): presumably this call is what fills `schema.json_paths` -- confirm.
    schema = SchemaParser("text=string:String,json:Struct(foo=fox:Int64,bar=bax:Int64)")
    schema.to_struct()

    # Frame using the original field names.
    original_json = pl.Struct([pl.Field("foo", pl.Int64), pl.Field("bar", pl.Int64)])
    dtype = pl.Struct(
        [pl.Field("text", pl.String), pl.Field("json", original_json)],
    )
    df = pl.DataFrame(
        {"text": ["foobar"], "json": [json.loads('{"foo": 0, "bar": 1}')]},
        dtype,
    )

    # Frame using the renamed fields.
    renamed_json = pl.Struct([pl.Field("fox", pl.Int64), pl.Field("bax", pl.Int64)])
    dtype_renamed = pl.Struct(
        [pl.Field("string", pl.String), pl.Field("json", renamed_json)],
    )
    df_renamed = pl.DataFrame(
        {"string": ["foobar"], "json": [json.loads('{"fox": 0, "bax": 1}')]},
        dtype_renamed,
    )

    unpacked = df.json.unpack(dtype).rename(schema.json_paths)
    assert unpacked.equals(df_renamed.unnest("json"))
test_unpacker.test_struct
test_struct() -> None:
Test a simple polars.Struct containing a few fields.
Test the following nested JSON content:
{
"text": "foobar",
"json": {
"foo": 0,
"bar": 1
}
}
as described by the following schema:
text: String,
json: Struct(
foo: Int64,
bar: Int64
)
source
def test_struct() -> None:
    """Check the unpacking of a simple `polars.Struct` holding a few fields.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": {
            "foo": 0,
            "bar": 1
        }
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: Struct(
        foo: Int64,
        bar: Int64
    )
    ```
    """
    payload = pl.Struct([pl.Field("foo", pl.Int64), pl.Field("bar", pl.Int64)])
    dtype = pl.Struct([pl.Field("text", pl.String), pl.Field("json", payload)])
    df = pl.DataFrame(
        {"text": ["foobar"], "json": [json.loads('{"foo": 0, "bar": 1}')]},
        dtype,
    )

    parsed = SchemaParser("text:String,json:Struct(foo:Int64,bar:Int64)").to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    unpacked = df.json.unpack(dtype)
    # Manual equivalent: unnest the structure and name columns after their path.
    expected = df.unnest("json").rename({"foo": "json.foo", "bar": "json.bar"})
    assert unpacked.equals(expected)
test_unpacker.test_struct_nested_in_list
test_struct_nested_in_list() -> None:
Test a polars.Struct nested in a polars.List.
Test the following nested JSON content:
{
"text": "foobar",
"json": [
{
"foo": 0,
"bar": 1
},
{
"foo": 2,
"bar": 3
}
]
}
as described by the following schema:
text: String,
json: List(
Struct(
foo: Int64,
bar: Int64
)
)
source
def test_struct_nested_in_list() -> None:
    """Check the unpacking of a `polars.Struct` nested in a `polars.List`.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": [
            {
                "foo": 0,
                "bar": 1
            },
            {
                "foo": 2,
                "bar": 3
            }
        ]
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: List(
        Struct(
            foo: Int64,
            bar: Int64
        )
    )
    ```
    """
    element = pl.Struct([pl.Field("foo", pl.Int64), pl.Field("bar", pl.Int64)])
    dtype = pl.Struct(
        [pl.Field("text", pl.String), pl.Field("json", pl.List(element))],
    )
    records = json.loads('[[{"foo": 0, "bar": 1}, {"foo": 2, "bar": 3}]]')
    df = pl.DataFrame({"text": "foobar", "json": records}, dtype)

    parsed = SchemaParser(
        "text:String,json:List(Struct(foo:Int64,bar:Int64))",
    ).to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    unpacked = df.json.unpack(dtype)
    # Manual equivalent: explode the list, unnest the structures, and name the
    # columns after their full path.
    paths = {"foo": "json.foo", "bar": "json.bar"}
    expected = df.explode("json").unnest("json").rename(paths)
    assert unpacked.equals(expected)
test_unpacker.test_struct_nested_in_struct
test_struct_nested_in_struct() -> None:
Test a polars.Struct nested within another polars.Struct.
Test the following nested JSON content:
{
"text": "foobar",
"json": {
"foo": {
"fox": 0,
"foz": 2
},
"bar": {
"bax": 1,
"baz": 3
}
}
}
as described by the following schema:
text: String,
json: Struct(
foo: Struct(
fox: Int64,
foz: Int64
),
bar: Struct(
bax: Int64,
baz: Int64
)
)
source
def test_struct_nested_in_struct() -> None:
    """Check the unpacking of a `polars.Struct` nested in another `polars.Struct`.

    Test the following nested JSON content:

    ```json
    {
        "text": "foobar",
        "json": {
            "foo": {
                "fox": 0,
                "foz": 2
            },
            "bar": {
                "bax": 1,
                "baz": 3
            }
        }
    }
    ```

    as described by the following schema:

    ```
    text: String,
    json: Struct(
        foo: Struct(
            fox: Int64,
            foz: Int64
        ),
        bar: Struct(
            bax: Int64,
            baz: Int64
        )
    )
    ```
    """
    # Writing this by hand is tedious, which is why generating it is preferable.
    foo = pl.Struct([pl.Field("fox", pl.Int64), pl.Field("foz", pl.Int64)])
    bar = pl.Struct([pl.Field("bax", pl.Int64), pl.Field("baz", pl.Int64)])
    payload = pl.Struct([pl.Field("foo", foo), pl.Field("bar", bar)])
    dtype = pl.Struct([pl.Field("text", pl.String), pl.Field("json", payload)])

    record = json.loads('{"foo": {"fox": 0, "foz": 2}, "bar": {"bax": 1, "baz": 3}}')
    df = pl.DataFrame({"text": ["foobar"], "json": [record]}, dtype)

    parsed = SchemaParser(
        "text:String,"
        "json:Struct(foo:Struct(fox:Int64,foz:Int64),bar:Struct(bax:Int64,baz:Int64))",
    ).to_struct()
    assert parsed == dtype

    assert dtype.to_schema() == df.schema

    unpacked = df.json.unpack(dtype)
    # Manual equivalent: unnest every level and name columns after their full path.
    paths = {
        "fox": "json.foo.fox",
        "foz": "json.foo.foz",
        "bax": "json.bar.bax",
        "baz": "json.bar.baz",
    }
    expected = df.unnest("json").unnest("foo", "bar").rename(paths)
    assert unpacked.equals(expected)