Initial commit: 首次建仓，建立目录结构

2026-06-11 23:49:54 +08:00
commit 4038a476b5
9396 changed files with 2372905 additions and 0 deletions
--- a/dashboard/venv/lib/python3.12/site-packages/pandas/io/iceberg.py
+++ b/dashboard/venv/lib/python3.12/site-packages/pandas/io/iceberg.py
@ -0,0 +1,155 @@
+from typing import (
+    Any,
+)
+
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import set_module
+
+from pandas import DataFrame
+
+
+@set_module("pandas")
+def read_iceberg(
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    columns: list[str] | None = None,
+    row_filter: str | None = None,
+    case_sensitive: bool = True,
+    snapshot_id: int | None = None,
+    limit: int | None = None,
+    scan_properties: dict[str, Any] | None = None,
+) -> DataFrame:
+    """
+    Load an Apache Iceberg table and return its contents as a DataFrame.
+
+    .. versionadded:: 3.0.0
+
+    .. warning::
+
+       read_iceberg is experimental and may change without warning.
+
+    Parameters
+    ----------
+    table_identifier : str
+        Identifier of the table to load from the catalog.
+    catalog_name : str, optional
+        Name of the catalog that holds the table.
+    catalog_properties : dict of {str: obj}, optional
+        Extra properties passed as keyword arguments when loading the catalog.
+        When omitted, no extra properties are passed.
+    columns : list of str, optional
+        Names of the columns to return in the output dataframe. When omitted,
+        all columns are selected (``"*"``).
+    row_filter : str, optional
+        Expression describing the desired rows. When omitted, every row matches.
+    case_sensitive : bool, default True
+        Whether column matching is case sensitive.
+    snapshot_id : int, optional
+        Snapshot ID to time travel to. By default the table is scanned as of the
+        current snapshot ID.
+    limit : int, optional
+        Number of rows to return in the scan result. By default all matching
+        rows are fetched.
+    scan_properties : dict of {str: obj}, optional
+        Additional options, as a dictionary with string keys, to use for this
+        scan. When omitted, an empty dictionary is used.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame holding the scan result of the Iceberg table.
+
+    See Also
+    --------
+    read_parquet : Read a Parquet file.
+
+    Examples
+    --------
+    >>> df = pd.read_iceberg(
+    ...     table_identifier="my_table",
+    ...     catalog_name="my_catalog",
+    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
+    ...     row_filter="trip_distance >= 10.0",
+    ...     columns=["VendorID", "tpep_pickup_datetime"],
+    ... )  # doctest: +SKIP
+    """
+    catalog_module = import_optional_dependency("pyiceberg.catalog")
+    expressions = import_optional_dependency("pyiceberg.expressions")
+
+    # A missing properties mapping means no extra catalog keyword arguments.
+    catalog_settings = {} if catalog_properties is None else catalog_properties
+    catalog = catalog_module.load_catalog(catalog_name, **catalog_settings)
+    iceberg_table = catalog.load_table(table_identifier)
+
+    # Replace every scan argument left as None with its explicit default.
+    scan_filter = expressions.AlwaysTrue() if row_filter is None else row_filter
+    projection = ("*",) if columns is None else tuple(columns)
+    scan_options = {} if scan_properties is None else scan_properties
+
+    scan = iceberg_table.scan(
+        row_filter=scan_filter,
+        selected_fields=projection,
+        case_sensitive=case_sensitive,
+        snapshot_id=snapshot_id,
+        options=scan_options,
+        limit=limit,
+    )
+    return scan.to_pandas()
+
+
+def to_iceberg(
+    df: DataFrame,
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    location: str | None = None,
+    append: bool = False,
+    snapshot_properties: dict[str, str] | None = None,
+) -> None:
+    """
+    Store the contents of a DataFrame in an Apache Iceberg table.
+
+    .. versionadded:: 3.0.0
+
+    Parameters
+    ----------
+    df : DataFrame
+        The data to write; it is converted to a PyArrow table before writing.
+    table_identifier : str
+        Identifier of the target table; the table is created if it does not exist.
+    catalog_name : str, optional
+        Name of the catalog that holds the table.
+    catalog_properties : dict of {str: obj}, optional
+        Extra properties passed as keyword arguments when loading the catalog.
+    location : str, optional
+        Location for the table.
+    append : bool, default False
+        If ``True``, append the data to the table instead of overwriting its content.
+    snapshot_properties : dict of {str: str}, optional
+        Custom properties to be added to the snapshot summary.
+
+    See Also
+    --------
+    read_iceberg : Read an Apache Iceberg table.
+    DataFrame.to_parquet : Write a DataFrame in Parquet format.
+    """
+    pa = import_optional_dependency("pyarrow")
+    catalog_module = import_optional_dependency("pyiceberg.catalog")
+
+    catalog_settings = {} if catalog_properties is None else catalog_properties
+    catalog = catalog_module.load_catalog(catalog_name, **catalog_settings)
+    arrow_data = pa.Table.from_pandas(df)
+    # `partition_spec`, `sort_order` and `properties` could be added in the
+    # future, but it may not be trivial without exposing PyIceberg objects.
+    iceberg_table = catalog.create_table_if_not_exists(
+        identifier=table_identifier,
+        schema=arrow_data.schema,
+        location=location,
+    )
+
+    summary_properties = {} if snapshot_properties is None else snapshot_properties
+    write = iceberg_table.append if append else iceberg_table.overwrite
+    write(arrow_data, snapshot_properties=summary_properties)