Initial commit: 首次建仓,建立目录结构
This commit is contained in:
155
dashboard/venv/lib/python3.12/site-packages/pandas/io/iceberg.py
Normal file
155
dashboard/venv/lib/python3.12/site-packages/pandas/io/iceberg.py
Normal file
@ -0,0 +1,155 @@
|
||||
from typing import (
|
||||
Any,
|
||||
)
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import set_module
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
@set_module("pandas")
def read_iceberg(
    table_identifier: str,
    catalog_name: str | None = None,
    *,
    catalog_properties: dict[str, Any] | None = None,
    columns: list[str] | None = None,
    row_filter: str | None = None,
    case_sensitive: bool = True,
    snapshot_id: int | None = None,
    limit: int | None = None,
    scan_properties: dict[str, Any] | None = None,
) -> DataFrame:
    """
    Load the contents of an Apache Iceberg table as a pandas DataFrame.

    The table is looked up through a PyIceberg catalog, scanned with the
    requested projection and filter, and the scan result is converted to a
    DataFrame. The optional dependency ``pyiceberg`` is required.

    .. versionadded:: 3.0.0

    .. warning::

       read_iceberg is experimental and may change without warning.

    Parameters
    ----------
    table_identifier : str
        Identifier of the table to load from the catalog.
    catalog_name : str, optional
        Name of the catalog to load.
    catalog_properties : dict of {str: obj}, optional
        Extra properties forwarded as keyword arguments when loading the
        catalog, used next to the catalog configuration.
    columns : list of str, optional
        Names of the columns to include in the returned DataFrame. When
        omitted, all columns (``"*"``) are selected.
    row_filter : str, optional
        Expression describing the rows to return. When omitted, an
        always-true expression is used, so no rows are filtered out.
    case_sensitive : bool, default True
        If True column matching is case sensitive.
    snapshot_id : int, optional
        Snapshot ID to time travel to. By default the table will be scanned
        as of the current snapshot ID.
    limit : int, optional
        Number of rows to return in the scan result. By default all matching
        rows will be fetched.
    scan_properties : dict of {str: obj}, optional
        Additional properties passed to the scan as its ``options``.

    Returns
    -------
    DataFrame
        DataFrame based on the Iceberg table.

    See Also
    --------
    read_parquet : Read a Parquet file.

    Examples
    --------
    >>> df = pd.read_iceberg(
    ...     table_identifier="my_table",
    ...     catalog_name="my_catalog",
    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
    ...     row_filter="trip_distance >= 10.0",
    ...     columns=["VendorID", "tpep_pickup_datetime"],
    ... )  # doctest: +SKIP
    """
    catalog_module = import_optional_dependency("pyiceberg.catalog")
    expressions_module = import_optional_dependency("pyiceberg.expressions")

    catalog = catalog_module.load_catalog(
        catalog_name,
        **({} if catalog_properties is None else catalog_properties),
    )
    iceberg_table = catalog.load_table(table_identifier)

    # Missing arguments are replaced by their "select everything" equivalents
    # right before the scan is built.
    scan_kwargs = {
        "row_filter": (
            expressions_module.AlwaysTrue() if row_filter is None else row_filter
        ),
        "selected_fields": ("*",) if columns is None else tuple(columns),
        "case_sensitive": case_sensitive,
        "snapshot_id": snapshot_id,
        "options": {} if scan_properties is None else scan_properties,
        "limit": limit,
    }
    return iceberg_table.scan(**scan_kwargs).to_pandas()
|
||||
|
||||
|
||||
def to_iceberg(
    df: DataFrame,
    table_identifier: str,
    catalog_name: str | None = None,
    *,
    catalog_properties: dict[str, Any] | None = None,
    location: str | None = None,
    append: bool = False,
    snapshot_properties: dict[str, str] | None = None,
) -> None:
    """
    Store a DataFrame in an Apache Iceberg table.

    The DataFrame is converted to a PyArrow table, the target Iceberg table
    is created from that schema if it does not exist yet, and the data is
    then either appended to the table or used to overwrite it. The optional
    dependencies ``pyarrow`` and ``pyiceberg`` are required.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    df : DataFrame
        The data to write.
    table_identifier : str
        Identifier of the table to write to.
    catalog_name : str, optional
        Name of the catalog to load.
    catalog_properties : dict of {str: obj}, optional
        Extra properties forwarded as keyword arguments when loading the
        catalog, used next to the catalog configuration.
    location : str, optional
        Location for the table, used if the table has to be created.
    append : bool, default False
        If ``True``, append data to the table, instead of replacing the
        content.
    snapshot_properties : dict of {str: str}, optional
        Custom properties to be added to the snapshot summary.

    See Also
    --------
    read_iceberg : Read an Apache Iceberg table.
    DataFrame.to_parquet : Write a DataFrame in Parquet format.
    """
    pa = import_optional_dependency("pyarrow")
    catalog_module = import_optional_dependency("pyiceberg.catalog")

    catalog = catalog_module.load_catalog(
        catalog_name,
        **({} if catalog_properties is None else catalog_properties),
    )
    arrow_data = pa.Table.from_pandas(df)

    # `partition_spec`, `sort_order` and `properties` could be supported in
    # the future, but it may not be trivial without exposing PyIceberg objects.
    iceberg_table = catalog.create_table_if_not_exists(
        identifier=table_identifier,
        schema=arrow_data.schema,
        location=location,
    )

    summary_properties = {} if snapshot_properties is None else snapshot_properties
    # Pick the write mode once, then issue a single write call.
    write = iceberg_table.append if append else iceberg_table.overwrite
    write(arrow_data, snapshot_properties=summary_properties)
|
||||
Reference in New Issue
Block a user