Initial commit: 首次建仓,建立目录结构
This commit is contained in:
@ -0,0 +1,9 @@
|
||||
from pandas.io.parsers.readers import (
|
||||
TextFileReader,
|
||||
TextParser,
|
||||
read_csv,
|
||||
read_fwf,
|
||||
read_table,
|
||||
)
|
||||
|
||||
__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,328 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
Pandas4Warning,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
from pandas.util._exceptions import (
|
||||
find_stack_level,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
pandas_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.inference import is_integer
|
||||
|
||||
from pandas.io._util import arrow_table_to_pandas
|
||||
from pandas.io.parsers.base_parser import ParserBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pyarrow as pa
|
||||
|
||||
from pandas._typing import ReadBuffer
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class ArrowParserWrapper(ParserBase):
|
||||
"""
|
||||
Wrapper for the pyarrow engine for read_csv()
|
||||
"""
|
||||
|
||||
def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
    """
    Store the source buffer and keywords, then validate the keywords.

    Parameters
    ----------
    src : ReadBuffer[bytes]
        The buffer later handed to ``pyarrow.csv.read_csv``.
    **kwds
        Parser keywords; kept by reference on ``self.kwds``.
    """
    # The base-class initializer consumes some entries of ``kwds`` in place.
    super().__init__(kwds)
    self.src = src
    self.kwds = kwds

    self._parse_kwds()
|
||||
|
||||
def _parse_kwds(self) -> None:
|
||||
"""
|
||||
Validates keywords before passing to pyarrow.
|
||||
"""
|
||||
encoding: str | None = self.kwds.get("encoding")
|
||||
self.encoding = "utf-8" if encoding is None else encoding
|
||||
|
||||
na_values = self.kwds["na_values"]
|
||||
if isinstance(na_values, dict):
|
||||
raise ValueError(
|
||||
"The pyarrow engine doesn't support passing a dict for na_values"
|
||||
)
|
||||
self.na_values = list(self.kwds["na_values"])
|
||||
|
||||
def _get_pyarrow_options(self) -> None:
|
||||
"""
|
||||
Rename some arguments to pass to pyarrow
|
||||
"""
|
||||
mapping = {
|
||||
"usecols": "include_columns",
|
||||
"na_values": "null_values",
|
||||
"escapechar": "escape_char",
|
||||
"skip_blank_lines": "ignore_empty_lines",
|
||||
"decimal": "decimal_point",
|
||||
"quotechar": "quote_char",
|
||||
}
|
||||
for pandas_name, pyarrow_name in mapping.items():
|
||||
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
|
||||
self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
|
||||
|
||||
# Date format handling
|
||||
# If we get a string, we need to convert it into a list for pyarrow
|
||||
# If we get a dict, we want to parse those separately
|
||||
date_format = self.date_format
|
||||
if isinstance(date_format, str):
|
||||
date_format = [date_format]
|
||||
else:
|
||||
# In case of dict, we don't want to propagate through, so
|
||||
# just set to pyarrow default of None
|
||||
|
||||
# Ideally, in future we disable pyarrow dtype inference (read in as string)
|
||||
# to prevent misreads.
|
||||
date_format = None
|
||||
self.kwds["timestamp_parsers"] = date_format
|
||||
|
||||
self.parse_options = {
|
||||
option_name: option_value
|
||||
for option_name, option_value in self.kwds.items()
|
||||
if option_value is not None
|
||||
and option_name
|
||||
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
|
||||
}
|
||||
|
||||
on_bad_lines = self.kwds.get("on_bad_lines")
|
||||
if on_bad_lines is not None:
|
||||
if callable(on_bad_lines):
|
||||
self.parse_options["invalid_row_handler"] = on_bad_lines
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
|
||||
self.parse_options["invalid_row_handler"] = (
|
||||
None # PyArrow raises an exception by default
|
||||
)
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:
|
||||
|
||||
def handle_warning(invalid_row) -> str:
|
||||
warnings.warn(
|
||||
f"Expected {invalid_row.expected_columns} columns, but found "
|
||||
f"{invalid_row.actual_columns}: {invalid_row.text}",
|
||||
ParserWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
return "skip"
|
||||
|
||||
self.parse_options["invalid_row_handler"] = handle_warning
|
||||
elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
|
||||
self.parse_options["invalid_row_handler"] = lambda _: "skip"
|
||||
|
||||
self.convert_options = {
|
||||
option_name: option_value
|
||||
for option_name, option_value in self.kwds.items()
|
||||
if option_value is not None
|
||||
and option_name
|
||||
in (
|
||||
"include_columns",
|
||||
"null_values",
|
||||
"true_values",
|
||||
"false_values",
|
||||
"decimal_point",
|
||||
"timestamp_parsers",
|
||||
)
|
||||
}
|
||||
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
|
||||
# autogenerated column names are prefixed with 'f' in pyarrow.csv
|
||||
if self.header is None and "include_columns" in self.convert_options:
|
||||
self.convert_options["include_columns"] = [
|
||||
f"f{n}" for n in self.convert_options["include_columns"]
|
||||
]
|
||||
|
||||
self.read_options = {
|
||||
"autogenerate_column_names": self.header is None,
|
||||
"skip_rows": self.header
|
||||
if self.header is not None
|
||||
else self.kwds["skiprows"],
|
||||
"encoding": self.encoding,
|
||||
}
|
||||
|
||||
def _get_convert_options(self):
    """
    Build the ``pyarrow.csv.ConvertOptions`` from ``self.convert_options``.

    If pyarrow rejects the arguments with a ``TypeError``, the likely
    culprits (``include_columns`` and ``null_values``) are checked so a
    pandas-specific message can be raised; otherwise the original error
    is re-raised.

    Raises
    ------
    ValueError
        From ``_validate_usecols`` when ``include_columns`` is unsupported.
    TypeError
        When ``null_values`` is not a list-like of strings, or the
        original pyarrow error when neither check applies.
    """
    csv_module = import_optional_dependency("pyarrow.csv")

    try:
        return csv_module.ConvertOptions(**self.convert_options)
    except TypeError as err:
        requested_columns = self.convert_options.get("include_columns", None)
        if requested_columns is not None:
            self._validate_usecols(requested_columns)

        null_markers = self.convert_options.get("null_values", set())
        only_strings = lib.is_list_like(null_markers) and all(
            isinstance(marker, str) for marker in null_markers
        )
        if not only_strings:
            raise TypeError(
                "The 'pyarrow' engine requires all na_values to be strings"
            ) from err

        raise
|
||||
|
||||
def _adjust_column_names(self, table: pa.Table) -> bool:
|
||||
num_cols = len(table.columns)
|
||||
multi_index_named = True
|
||||
if self.header is None:
|
||||
if self.names is None:
|
||||
self.names = range(num_cols)
|
||||
if len(self.names) != num_cols:
|
||||
# usecols is passed through to pyarrow, we only handle index col here
|
||||
# The only way self.names is not the same length as number of cols is
|
||||
# if we have int index_col. We should just pad the names(they will get
|
||||
# removed anyways) to expected length then.
|
||||
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
|
||||
self.names = columns_prefix + self.names
|
||||
multi_index_named = False
|
||||
return multi_index_named
|
||||
|
||||
def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
|
||||
if self.index_col is not None:
|
||||
index_to_set = self.index_col.copy()
|
||||
for i, item in enumerate(self.index_col):
|
||||
if is_integer(item):
|
||||
index_to_set[i] = frame.columns[item]
|
||||
# String case
|
||||
elif item not in frame.columns:
|
||||
raise ValueError(f"Index {item} invalid")
|
||||
|
||||
# Process dtype for index_col and drop from dtypes
|
||||
if self.dtype is not None:
|
||||
key, new_dtype = (
|
||||
(item, self.dtype.get(item))
|
||||
if self.dtype.get(item) is not None
|
||||
else (frame.columns[item], self.dtype.get(frame.columns[item]))
|
||||
)
|
||||
if new_dtype is not None:
|
||||
frame[key] = frame[key].astype(new_dtype)
|
||||
del self.dtype[key]
|
||||
|
||||
frame.set_index(index_to_set, drop=True, inplace=True)
|
||||
# Clear names if headerless and no name given
|
||||
if self.header is None and not multi_index_named:
|
||||
frame.index.names = [None] * len(frame.index.names)
|
||||
|
||||
return frame
|
||||
|
||||
def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
|
||||
if self.dtype is not None:
|
||||
# Ignore non-existent columns from dtype mapping
|
||||
# like other parsers do
|
||||
if isinstance(self.dtype, dict):
|
||||
self.dtype = {
|
||||
k: pandas_dtype(v)
|
||||
for k, v in self.dtype.items()
|
||||
if k in frame.columns
|
||||
}
|
||||
else:
|
||||
self.dtype = pandas_dtype(self.dtype)
|
||||
try:
|
||||
frame = frame.astype(self.dtype)
|
||||
except TypeError as err:
|
||||
# GH#44901 reraise to keep api consistent
|
||||
raise ValueError(str(err)) from err
|
||||
return frame
|
||||
|
||||
def _finalize_pandas_output(
|
||||
self, frame: DataFrame, multi_index_named: bool
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Processes data read in based on kwargs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
frame : DataFrame
|
||||
The DataFrame to process.
|
||||
multi_index_named : bool
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
The processed DataFrame.
|
||||
"""
|
||||
frame = self._do_date_conversions(frame.columns, frame)
|
||||
frame = self._finalize_index(frame, multi_index_named)
|
||||
frame = self._finalize_dtype(frame)
|
||||
return frame
|
||||
|
||||
def _validate_usecols(self, usecols) -> None:
|
||||
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
|
||||
raise ValueError(
|
||||
"The pyarrow engine does not allow 'usecols' to be integer "
|
||||
"column positions. Pass a list of string column names instead."
|
||||
)
|
||||
elif callable(usecols):
|
||||
raise ValueError(
|
||||
"The pyarrow engine does not allow 'usecols' to be a callable."
|
||||
)
|
||||
|
||||
def read(self) -> DataFrame:
    """
    Read the CSV source into a DataFrame and post-process it according
    to the kwargs passed to the constructor.

    Returns
    -------
    DataFrame
        The DataFrame created from the CSV file.

    Raises
    ------
    ParserError
        Wraps ``pyarrow.ArrowInvalid`` raised while reading.
    """
    pa = import_optional_dependency("pyarrow")
    pyarrow_csv = import_optional_dependency("pyarrow.csv")
    self._get_pyarrow_options()
    convert_options = self._get_convert_options()

    try:
        # The option objects are built inside the ``try`` so that an
        # ArrowInvalid raised while constructing them is translated too.
        table = pyarrow_csv.read_csv(
            self.src,
            read_options=pyarrow_csv.ReadOptions(**self.read_options),
            parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
            convert_options=convert_options,
        )
    except pa.ArrowInvalid as e:
        raise ParserError(e) from e

    dtype_backend = self.kwds["dtype_backend"]

    # Convert all pa.null() cols -> float64 (non nullable)
    # else Int64 (nullable case, see below)
    if dtype_backend is lib.no_default:
        float_type = pa.float64()
        schema = table.schema
        for position, arrow_type in enumerate(table.schema.types):
            if pa.types.is_null(arrow_type):
                retyped_field = schema.field(position).with_type(float_type)
                schema = schema.set(position, retyped_field)

        table = table.cast(schema)

    multi_index_named = self._adjust_column_names(table)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "make_block is deprecated",
            Pandas4Warning,
        )
        frame = arrow_table_to_pandas(
            table,
            dtype_backend=dtype_backend,
            null_to_int64=True,
            dtype=self.dtype,
            names=self.names,
        )

    if self.header is None:
        frame.columns = self.names

    return self._finalize_pandas_output(frame, multi_index_named)
|
||||
@ -0,0 +1,997 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from copy import copy
|
||||
import csv
|
||||
from enum import Enum
|
||||
import itertools
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
cast,
|
||||
final,
|
||||
overload,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
parsers,
|
||||
)
|
||||
import pandas._libs.ops as libops
|
||||
from pandas._libs.parsers import STR_NA_VALUES
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_dict_like,
|
||||
is_float_dtype,
|
||||
is_integer,
|
||||
is_integer_dtype,
|
||||
is_list_like,
|
||||
is_object_dtype,
|
||||
is_string_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import isna
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
StringDtype,
|
||||
)
|
||||
from pandas.core import algorithms
|
||||
from pandas.core.arrays import (
|
||||
ArrowExtensionArray,
|
||||
BaseMaskedArray,
|
||||
BooleanArray,
|
||||
FloatingArray,
|
||||
IntegerArray,
|
||||
)
|
||||
from pandas.core.indexes.api import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
default_index,
|
||||
ensure_index_from_sequences,
|
||||
)
|
||||
from pandas.core.series import Series
|
||||
from pandas.core.tools import datetimes as tools
|
||||
|
||||
from pandas.io.common import is_potential_multi_index
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Iterable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
DtypeArg,
|
||||
Hashable,
|
||||
HashableT,
|
||||
Scalar,
|
||||
SequenceT,
|
||||
)
|
||||
|
||||
|
||||
class ParserBase:
|
||||
class BadLineHandleMethod(Enum):
    """Available policies for handling a malformed row."""

    # Let the parser raise on a bad line.
    ERROR = 0
    # Emit a warning for a bad line, then skip it.
    WARN = 1
    # Skip a bad line silently.
    SKIP = 2
|
||||
|
||||
_implicit_index: bool
|
||||
_first_chunk: bool
|
||||
keep_default_na: bool
|
||||
dayfirst: bool
|
||||
cache_dates: bool
|
||||
usecols_dtype: str | None
|
||||
|
||||
def __init__(self, kwds) -> None:
    """
    Pull the parser configuration out of ``kwds``.

    Several entries (``parse_dates``, ``date_parser``, ``date_format``,
    ``dayfirst``, ``cache_dates``) are popped, so ``kwds`` is mutated.

    Raises
    ------
    TypeError
        If ``parse_dates`` is neither a boolean/None nor a list.
    ValueError
        If a multi-index header is combined with ``usecols``, ``names``
        or a non-integer ``index_col``.
    """
    self._implicit_index = False

    self.names = kwds.get("names")
    self.orig_names: Sequence[Hashable] | None = None

    self.index_col = kwds.get("index_col", None)
    self.unnamed_cols: set = set()
    self.index_names: Sequence[Hashable] | None = None
    self.col_names: Sequence[Hashable] | None = None

    date_spec = kwds.pop("parse_dates", False)
    if date_spec is None or lib.is_bool(date_spec):
        date_spec = bool(date_spec)
    elif not isinstance(date_spec, list):
        raise TypeError(
            "Only booleans and lists are accepted for the 'parse_dates' parameter"
        )
    self.parse_dates: bool | list = date_spec
    self.date_parser = kwds.pop("date_parser", lib.no_default)
    self.date_format = kwds.pop("date_format", None)
    self.dayfirst = kwds.pop("dayfirst", False)

    self.na_values = kwds.get("na_values")
    self.na_fvalues = kwds.get("na_fvalues")
    self.na_filter = kwds.get("na_filter", False)
    self.keep_default_na = kwds.get("keep_default_na", True)

    # Copied so later edits to ``self.dtype`` leave the caller's object alone.
    self.dtype = copy(kwds.get("dtype", None))
    self.converters = kwds.get("converters")
    self.dtype_backend = kwds.get("dtype_backend")

    self.true_values = kwds.get("true_values")
    self.false_values = kwds.get("false_values")
    self.cache_dates = kwds.pop("cache_dates", True)

    # validate header options for mi
    self.header = kwds.get("header")
    multi_index_header = is_list_like(self.header, allow_sets=False)
    if multi_index_header:
        if kwds.get("usecols"):
            raise ValueError(
                "cannot specify usecols when specifying a multi-index header"
            )
        if kwds.get("names"):
            raise ValueError(
                "cannot specify names when specifying a multi-index header"
            )

        # validate index_col that only contains integers
        if self.index_col is not None:
            # In this case we can pin down index_col as list[int]
            if is_integer(self.index_col):
                self.index_col = [self.index_col]
            else:
                all_positions = is_list_like(
                    self.index_col, allow_sets=False
                ) and all(map(is_integer, self.index_col))
                if not all_positions:
                    raise ValueError(
                        "index_col must only contain integers of column positions "
                        "when specifying a multi-index header"
                    )
                self.index_col = list(self.index_col)

    self._first_chunk = True

    self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])

    # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
    # Normally, this arg would get pre-processed earlier on
    self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
|
||||
|
||||
def close(self) -> None:
    """Do nothing; this implementation has nothing to release."""
    return None
|
||||
|
||||
@final
|
||||
def _should_parse_dates(self, i: int) -> bool:
|
||||
if isinstance(self.parse_dates, bool):
|
||||
return self.parse_dates
|
||||
else:
|
||||
if self.index_names is not None:
|
||||
name = self.index_names[i]
|
||||
else:
|
||||
name = None
|
||||
j = i if self.index_col is None else self.index_col[i]
|
||||
|
||||
return (j in self.parse_dates) or (
|
||||
name is not None and name in self.parse_dates
|
||||
)
|
||||
|
||||
@final
|
||||
def _extract_multi_indexer_columns(
|
||||
self,
|
||||
header,
|
||||
index_names: Sequence[Hashable] | None,
|
||||
passed_names: bool = False,
|
||||
) -> tuple[
|
||||
Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
|
||||
]:
|
||||
"""
|
||||
Extract and return the names, index_names, col_names if the column
|
||||
names are a MultiIndex.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
header: list of lists
|
||||
The header rows
|
||||
index_names: list, optional
|
||||
The names of the future index
|
||||
passed_names: bool, default False
|
||||
A flag specifying if names where passed
|
||||
|
||||
"""
|
||||
if len(header) < 2:
|
||||
return header[0], index_names, None, passed_names
|
||||
|
||||
# the names are the tuples of the header that are not the index cols
|
||||
# 0 is the name of the index, assuming index_col is a list of column
|
||||
# numbers
|
||||
ic = self.index_col
|
||||
if ic is None:
|
||||
ic = []
|
||||
|
||||
if not isinstance(ic, (list, tuple, np.ndarray)):
|
||||
ic = [ic]
|
||||
sic = set(ic)
|
||||
|
||||
# clean the index_names
|
||||
index_names = header.pop(-1)
|
||||
index_names, _, _ = self._clean_index_names(index_names, self.index_col)
|
||||
|
||||
# extract the columns
|
||||
field_count = len(header[0])
|
||||
|
||||
# check if header lengths are equal
|
||||
if not all(len(header_iter) == field_count for header_iter in header[1:]):
|
||||
raise ParserError("Header rows must have an equal number of columns.")
|
||||
|
||||
def extract(r):
|
||||
return tuple(r[i] for i in range(field_count) if i not in sic)
|
||||
|
||||
columns = list(zip(*(extract(r) for r in header), strict=True))
|
||||
names = columns.copy()
|
||||
for single_ic in sorted(ic):
|
||||
names.insert(single_ic, single_ic)
|
||||
|
||||
# Clean the column names (if we have an index_col).
|
||||
if ic:
|
||||
col_names = [
|
||||
r[ic[0]]
|
||||
if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
|
||||
else None
|
||||
for r in header
|
||||
]
|
||||
else:
|
||||
col_names = [None] * len(header)
|
||||
|
||||
passed_names = True
|
||||
|
||||
return names, index_names, col_names, passed_names
|
||||
|
||||
@final
|
||||
def _maybe_make_multi_index_columns(
|
||||
self,
|
||||
columns: SequenceT,
|
||||
col_names: Sequence[Hashable] | None = None,
|
||||
) -> SequenceT | MultiIndex:
|
||||
# possibly create a column mi here
|
||||
if is_potential_multi_index(columns):
|
||||
columns_mi = cast("Sequence[tuple[Hashable, ...]]", columns)
|
||||
return MultiIndex.from_tuples(columns_mi, names=col_names)
|
||||
return columns
|
||||
|
||||
@final
|
||||
def _make_index(
|
||||
self, alldata, columns, indexnamerow: list[Scalar] | None = None
|
||||
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
|
||||
index: Index | None
|
||||
if isinstance(self.index_col, list) and len(self.index_col):
|
||||
to_remove = []
|
||||
indexes = []
|
||||
for idx in self.index_col:
|
||||
if isinstance(idx, str):
|
||||
raise ValueError(f"Index {idx} invalid")
|
||||
to_remove.append(idx)
|
||||
indexes.append(alldata[idx])
|
||||
# remove index items from content and columns, don't pop in
|
||||
# loop
|
||||
for i in sorted(to_remove, reverse=True):
|
||||
alldata.pop(i)
|
||||
if not self._implicit_index:
|
||||
columns.pop(i)
|
||||
index = self._agg_index(indexes)
|
||||
|
||||
# add names for the index
|
||||
if indexnamerow:
|
||||
coffset = len(indexnamerow) - len(columns)
|
||||
index = index.set_names(indexnamerow[:coffset])
|
||||
else:
|
||||
index = None
|
||||
|
||||
# maybe create a mi on the columns
|
||||
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
|
||||
|
||||
return index, columns
|
||||
|
||||
@final
|
||||
def _clean_mapping(self, mapping):
|
||||
"""converts col numbers to names"""
|
||||
if not isinstance(mapping, dict):
|
||||
return mapping
|
||||
clean = {}
|
||||
# for mypy
|
||||
assert self.orig_names is not None
|
||||
|
||||
for col, v in mapping.items():
|
||||
if isinstance(col, int) and col not in self.orig_names:
|
||||
col = self.orig_names[col]
|
||||
clean[col] = v
|
||||
if isinstance(mapping, defaultdict):
|
||||
remaining_cols = set(self.orig_names) - set(clean.keys())
|
||||
clean.update({col: mapping[col] for col in remaining_cols})
|
||||
return clean
|
||||
|
||||
@final
|
||||
def _agg_index(self, index) -> Index:
|
||||
arrays = []
|
||||
converters = self._clean_mapping(self.converters)
|
||||
clean_dtypes = self._clean_mapping(self.dtype)
|
||||
|
||||
if self.index_names is not None:
|
||||
names: Iterable = self.index_names
|
||||
zip_strict = True
|
||||
else:
|
||||
names = itertools.cycle([None])
|
||||
zip_strict = False
|
||||
for i, (arr, name) in enumerate(zip(index, names, strict=zip_strict)):
|
||||
if self._should_parse_dates(i):
|
||||
arr = date_converter(
|
||||
arr,
|
||||
col=self.index_names[i] if self.index_names is not None else None,
|
||||
dayfirst=self.dayfirst,
|
||||
cache_dates=self.cache_dates,
|
||||
date_format=self.date_format,
|
||||
)
|
||||
|
||||
if self.na_filter:
|
||||
col_na_values = self.na_values
|
||||
col_na_fvalues = self.na_fvalues
|
||||
else:
|
||||
col_na_values = set()
|
||||
col_na_fvalues = set()
|
||||
|
||||
if isinstance(self.na_values, dict):
|
||||
assert self.index_names is not None
|
||||
col_name = self.index_names[i]
|
||||
if col_name is not None:
|
||||
col_na_values, col_na_fvalues = get_na_values(
|
||||
col_name, self.na_values, self.na_fvalues, self.keep_default_na
|
||||
)
|
||||
else:
|
||||
col_na_values, col_na_fvalues = set(), set()
|
||||
|
||||
cast_type = None
|
||||
index_converter = False
|
||||
if self.index_names is not None:
|
||||
if isinstance(clean_dtypes, dict):
|
||||
cast_type = clean_dtypes.get(self.index_names[i], None)
|
||||
|
||||
if isinstance(converters, dict):
|
||||
index_converter = converters.get(self.index_names[i]) is not None
|
||||
|
||||
try_num_bool = not (
|
||||
(cast_type and is_string_dtype(cast_type)) or index_converter
|
||||
)
|
||||
|
||||
arr, _ = self._infer_types(
|
||||
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
|
||||
)
|
||||
if cast_type is not None:
|
||||
# Don't perform RangeIndex inference
|
||||
idx = Index(arr, name=name, dtype=cast_type, copy=False)
|
||||
else:
|
||||
idx = ensure_index_from_sequences([arr], [name])
|
||||
arrays.append(idx)
|
||||
|
||||
if len(arrays) == 1:
|
||||
return arrays[0]
|
||||
else:
|
||||
return MultiIndex.from_arrays(arrays)
|
||||
|
||||
@final
|
||||
def _set_noconvert_dtype_columns(
|
||||
self, col_indices: list[int], names: Sequence[Hashable]
|
||||
) -> set[int]:
|
||||
"""
|
||||
Set the columns that should not undergo dtype conversions.
|
||||
|
||||
Currently, any column that is involved with date parsing will not
|
||||
undergo such conversions. If usecols is specified, the positions of the columns
|
||||
not to cast is relative to the usecols not to all columns.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col_indices: The indices specifying order and positions of the columns
|
||||
names: The column names which order is corresponding with the order
|
||||
of col_indices
|
||||
|
||||
Returns
|
||||
-------
|
||||
A set of integers containing the positions of the columns not to convert.
|
||||
"""
|
||||
usecols: list[int] | list[str] | None
|
||||
noconvert_columns = set()
|
||||
if self.usecols_dtype == "integer":
|
||||
# A set of integers will be converted to a list in
|
||||
# the correct order every single time.
|
||||
usecols = sorted(self.usecols)
|
||||
elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
|
||||
# The names attribute should have the correct columns
|
||||
# in the proper order for indexing with parse_dates.
|
||||
usecols = col_indices
|
||||
else:
|
||||
# Usecols is empty.
|
||||
usecols = None
|
||||
|
||||
def _set(x) -> int:
|
||||
if usecols is not None and is_integer(x):
|
||||
x = usecols[x]
|
||||
|
||||
if not is_integer(x):
|
||||
x = col_indices[names.index(x)]
|
||||
|
||||
return x
|
||||
|
||||
if isinstance(self.parse_dates, list):
|
||||
validate_parse_dates_presence(self.parse_dates, names)
|
||||
for val in self.parse_dates:
|
||||
noconvert_columns.add(_set(val))
|
||||
|
||||
elif self.parse_dates:
|
||||
if isinstance(self.index_col, list):
|
||||
for k in self.index_col:
|
||||
noconvert_columns.add(_set(k))
|
||||
elif self.index_col is not None:
|
||||
noconvert_columns.add(_set(self.index_col))
|
||||
|
||||
return noconvert_columns
|
||||
|
||||
@final
|
||||
def _infer_types(
|
||||
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
|
||||
) -> tuple[ArrayLike, int]:
|
||||
"""
|
||||
Infer types of values, possibly casting
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : ndarray
|
||||
na_values : set
|
||||
no_dtype_specified: Specifies if we want to cast explicitly
|
||||
try_num_bool : bool, default try
|
||||
try to cast values to numeric (first preference) or boolean
|
||||
|
||||
Returns
|
||||
-------
|
||||
converted : ndarray or ExtensionArray
|
||||
na_count : int
|
||||
"""
|
||||
na_count = 0
|
||||
if issubclass(values.dtype.type, (np.number, np.bool_)):
|
||||
# If our array has numeric dtype, we don't have to check for strings in isin
|
||||
na_values = np.array([val for val in na_values if not isinstance(val, str)])
|
||||
mask = algorithms.isin(values, na_values)
|
||||
na_count = mask.astype("uint8", copy=False).sum()
|
||||
if na_count > 0:
|
||||
if is_integer_dtype(values):
|
||||
values = values.astype(np.float64)
|
||||
np.putmask(values, mask, np.nan)
|
||||
return values, na_count
|
||||
|
||||
dtype_backend = self.dtype_backend
|
||||
non_default_dtype_backend = (
|
||||
no_dtype_specified and dtype_backend is not lib.no_default
|
||||
)
|
||||
result: ArrayLike
|
||||
|
||||
if try_num_bool and is_object_dtype(values.dtype):
|
||||
# exclude e.g DatetimeIndex here
|
||||
try:
|
||||
result, result_mask = lib.maybe_convert_numeric(
|
||||
values,
|
||||
na_values,
|
||||
False,
|
||||
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
|
||||
)
|
||||
except (ValueError, TypeError):
|
||||
# e.g. encountering datetime string gets ValueError
|
||||
# TypeError can be raised in floatify
|
||||
na_count = parsers.sanitize_objects(values, na_values)
|
||||
result = values
|
||||
else:
|
||||
if non_default_dtype_backend:
|
||||
if result_mask is None:
|
||||
result_mask = np.zeros(result.shape, dtype=np.bool_)
|
||||
|
||||
if result_mask.all():
|
||||
result = IntegerArray(
|
||||
np.ones(result_mask.shape, dtype=np.int64), result_mask
|
||||
)
|
||||
elif is_integer_dtype(result):
|
||||
result = IntegerArray(result, result_mask)
|
||||
elif is_bool_dtype(result):
|
||||
result = BooleanArray(result, result_mask)
|
||||
elif is_float_dtype(result):
|
||||
result = FloatingArray(result, result_mask)
|
||||
|
||||
na_count = result_mask.sum()
|
||||
else:
|
||||
na_count = isna(result).sum()
|
||||
else:
|
||||
result = values
|
||||
if values.dtype == np.object_:
|
||||
na_count = parsers.sanitize_objects(values, na_values)
|
||||
|
||||
if (
|
||||
result.dtype == np.object_
|
||||
and try_num_bool
|
||||
and (len(result) == 0 or not isinstance(result[0], int))
|
||||
):
|
||||
result, bool_mask = libops.maybe_convert_bool(
|
||||
np.asarray(values),
|
||||
true_values=self.true_values,
|
||||
false_values=self.false_values,
|
||||
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
|
||||
)
|
||||
if result.dtype == np.bool_ and non_default_dtype_backend:
|
||||
if bool_mask is None:
|
||||
bool_mask = np.zeros(result.shape, dtype=np.bool_)
|
||||
result = BooleanArray(result, bool_mask)
|
||||
elif result.dtype == np.object_ and non_default_dtype_backend:
|
||||
# read_excel sends array of datetime objects
|
||||
if not lib.is_datetime_array(result, skipna=True):
|
||||
dtype = StringDtype()
|
||||
cls = dtype.construct_array_type()
|
||||
result = cls._from_sequence(values, dtype=dtype)
|
||||
|
||||
if dtype_backend == "pyarrow":
|
||||
pa = import_optional_dependency("pyarrow")
|
||||
if isinstance(result, np.ndarray):
|
||||
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
|
||||
elif isinstance(result, BaseMaskedArray):
|
||||
if result._mask.all():
|
||||
# We want an arrow null array here
|
||||
result = ArrowExtensionArray(pa.array([None] * len(result)))
|
||||
else:
|
||||
result = ArrowExtensionArray(
|
||||
pa.array(result._data, mask=result._mask)
|
||||
)
|
||||
else:
|
||||
result = ArrowExtensionArray(
|
||||
pa.array(result.to_numpy(), from_pandas=True)
|
||||
)
|
||||
|
||||
return result, na_count
|
||||
|
||||
@overload
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Index,
|
||||
data: DataFrame,
|
||||
) -> DataFrame: ...
|
||||
|
||||
@overload
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Sequence[Hashable],
|
||||
data: Mapping[Hashable, ArrayLike],
|
||||
) -> Mapping[Hashable, ArrayLike]: ...
|
||||
|
||||
@final
|
||||
def _do_date_conversions(
|
||||
self,
|
||||
names: Sequence[Hashable] | Index,
|
||||
data: Mapping[Hashable, ArrayLike] | DataFrame,
|
||||
) -> Mapping[Hashable, ArrayLike] | DataFrame:
|
||||
if not isinstance(self.parse_dates, list):
|
||||
return data
|
||||
for colspec in self.parse_dates:
|
||||
if isinstance(colspec, int) and colspec not in data:
|
||||
colspec = names[colspec]
|
||||
if (isinstance(self.index_col, list) and colspec in self.index_col) or (
|
||||
isinstance(self.index_names, list) and colspec in self.index_names
|
||||
):
|
||||
continue
|
||||
result = date_converter(
|
||||
data[colspec],
|
||||
col=colspec,
|
||||
dayfirst=self.dayfirst,
|
||||
cache_dates=self.cache_dates,
|
||||
date_format=self.date_format,
|
||||
)
|
||||
# error: Unsupported target for indexed assignment
|
||||
# ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame")
|
||||
data[colspec] = result # type: ignore[index]
|
||||
|
||||
return data
|
||||
|
||||
@final
|
||||
def _check_data_length(
|
||||
self,
|
||||
columns: Sequence[Hashable],
|
||||
data: Sequence[ArrayLike],
|
||||
) -> None:
|
||||
"""Checks if length of data is equal to length of column names.
|
||||
|
||||
One set of trailing commas is allowed. self.index_col not False
|
||||
results in a ParserError previously when lengths do not match.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns: list of column names
|
||||
data: list of array-likes containing the data column-wise.
|
||||
"""
|
||||
if not self.index_col and len(columns) != len(data) and columns:
|
||||
empty_str = is_object_dtype(data[-1]) and data[-1] == ""
|
||||
# error: No overload variant of "__ror__" of "ndarray" matches
|
||||
# argument type "ExtensionArray"
|
||||
empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator]
|
||||
if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
|
||||
return
|
||||
warnings.warn(
|
||||
"Length of header or names does not match length of data. This leads "
|
||||
"to a loss of data with index_col=False.",
|
||||
ParserWarning,
|
||||
stacklevel=find_stack_level(),
|
||||
)
|
||||
|
||||
@final
|
||||
def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
|
||||
"""
|
||||
Validates that all usecols are present in a given
|
||||
list of names. If not, raise a ValueError that
|
||||
shows what usecols are missing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : iterable of usecols
|
||||
The columns to validate are present in names.
|
||||
names : iterable of names
|
||||
The column names to check against.
|
||||
|
||||
Returns
|
||||
-------
|
||||
usecols : iterable of usecols
|
||||
The `usecols` parameter if the validation succeeds.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : Columns were missing. Error message will list them.
|
||||
"""
|
||||
missing = [c for c in usecols if c not in names]
|
||||
if len(missing) > 0:
|
||||
raise ValueError(
|
||||
f"Usecols do not match columns, columns expected but not found: "
|
||||
f"{missing}"
|
||||
)
|
||||
|
||||
return usecols
|
||||
|
||||
@final
|
||||
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
|
||||
if not is_index_col(index_col):
|
||||
return None, columns, index_col
|
||||
|
||||
columns = list(columns)
|
||||
|
||||
# In case of no rows and multiindex columns we have to set index_names to
|
||||
# list of Nones GH#38292
|
||||
if not columns:
|
||||
return [None] * len(index_col), columns, index_col
|
||||
|
||||
cp_cols = list(columns)
|
||||
index_names: list[str | int | None] = []
|
||||
|
||||
# don't mutate
|
||||
index_col = list(index_col)
|
||||
|
||||
for i, c in enumerate(index_col):
|
||||
if isinstance(c, str):
|
||||
index_names.append(c)
|
||||
for j, name in enumerate(cp_cols):
|
||||
if name == c:
|
||||
index_col[i] = j
|
||||
columns.remove(name)
|
||||
break
|
||||
else:
|
||||
name = cp_cols[c]
|
||||
columns.remove(name)
|
||||
index_names.append(name)
|
||||
|
||||
# Only clean index names that were placeholders.
|
||||
for i, name in enumerate(index_names):
|
||||
if isinstance(name, str) and name in self.unnamed_cols:
|
||||
index_names[i] = None
|
||||
|
||||
return index_names, columns, index_col
|
||||
|
||||
@final
|
||||
def _get_empty_meta(
|
||||
self, columns: Sequence[HashableT], dtype: DtypeArg | None = None
|
||||
) -> tuple[Index, list[HashableT], dict[HashableT, Series]]:
|
||||
columns = list(columns)
|
||||
|
||||
index_col = self.index_col
|
||||
index_names = self.index_names
|
||||
|
||||
# Convert `dtype` to a defaultdict of some kind.
|
||||
# This will enable us to write `dtype[col_name]`
|
||||
# without worrying about KeyError issues later on.
|
||||
dtype_dict: defaultdict[Hashable, Any]
|
||||
if not is_dict_like(dtype):
|
||||
# if dtype == None, default will be object.
|
||||
dtype_dict = defaultdict(lambda: dtype)
|
||||
else:
|
||||
dtype = cast(dict, dtype)
|
||||
dtype_dict = defaultdict(
|
||||
lambda: None,
|
||||
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
|
||||
)
|
||||
|
||||
# Even though we have no data, the "index" of the empty DataFrame
|
||||
# could for example still be an empty MultiIndex. Thus, we need to
|
||||
# check whether we have any index columns specified, via either:
|
||||
#
|
||||
# 1) index_col (column indices)
|
||||
# 2) index_names (column names)
|
||||
#
|
||||
# Both must be non-null to ensure a successful construction. Otherwise,
|
||||
# we have to create a generic empty Index.
|
||||
index: Index
|
||||
if (index_col is None or index_col is False) or index_names is None:
|
||||
index = default_index(0)
|
||||
else:
|
||||
# TODO: We could return default_index(0) if dtype_dict[name] is None
|
||||
data = [
|
||||
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
|
||||
]
|
||||
if len(data) == 1:
|
||||
index = data[0]
|
||||
else:
|
||||
index = MultiIndex.from_arrays(data)
|
||||
index_col.sort()
|
||||
|
||||
for i, n in enumerate(index_col):
|
||||
columns.pop(n - i)
|
||||
|
||||
col_dict = {
|
||||
col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
|
||||
}
|
||||
|
||||
return index, columns, col_dict
|
||||
|
||||
|
||||
def date_converter(
    date_col,
    col: Hashable,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    """
    Try to parse one column of values as datetimes.

    Parameters
    ----------
    date_col : array-like with a ``dtype`` attribute
        Returned as-is when its dtype kind is already ``"M"`` or ``"m"``.
    col : Hashable
        Label used to look up the format when ``date_format`` is a dict.
    dayfirst : bool, default False
    cache_dates : bool, default True
    date_format : dict, str or None
        A single format, or a mapping from column label to format.

    Returns
    -------
    array-like
        The parsed values, or the string-converted input when
        ``to_datetime`` raises ``ValueError`` or ``TypeError``.
    """
    if date_col.dtype.kind in "Mm":
        return date_col

    if isinstance(date_format, dict):
        fmt = date_format.get(col)
    else:
        fmt = date_format

    as_strings = lib.ensure_string_array(np.asarray(date_col))
    try:
        parsed = tools.to_datetime(
            as_strings,
            format=fmt,
            utc=False,
            dayfirst=dayfirst,
            cache=cache_dates,
        )
    except (ValueError, TypeError):
        # Unparseable input is handed back as strings rather than raising:
        # test_usecols_with_parse_dates4
        # test_multi_index_parse_dates
        return as_strings

    if not isinstance(parsed, DatetimeIndex):
        return parsed._values

    values = parsed.to_numpy()
    values.flags.writeable = True
    return values
|
||||
|
||||
|
||||
# Mapping from parser keyword name to its default value.
# NOTE(review): presumably consumed by the read_csv/read_table entry points
# to fill in options the caller did not pass; confirm against the callers.
# The commented-out keys ('engine', 'iterator') are deliberately absent from
# the mapping.
parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "dayfirst": False,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}
|
||||
|
||||
|
||||
def get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
    """
    Look up the NaN markers that apply to one column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The NaN markers as strings, either shared or keyed by column.
    na_fvalues : array-like, dict
        The NaN markers as floats, either shared or keyed by column.
    keep_default_na : bool
        Only consulted when `na_values` is a dict that has no entry for
        `col`: if True the default NaN strings are used, otherwise the
        empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if not isinstance(na_values, dict):
        # Shared markers apply to every column.
        return na_values, na_fvalues

    if col in na_values:
        return na_values[col], na_fvalues[col]

    return (STR_NA_VALUES, set()) if keep_default_na else (set(), set())
|
||||
|
||||
|
||||
def is_index_col(col) -> bool:
    """
    Return True unless ``col`` is the ``None`` or ``False`` singleton.

    Identity, not truthiness, is tested, so ``0`` and an empty list both
    count as an index specification.
    """
    return not (col is None or col is False)
|
||||
|
||||
|
||||
def validate_parse_dates_presence(
    parse_dates: bool | list, columns: Sequence[Hashable]
) -> set:
    """
    Check that the columns named in ``parse_dates`` exist.

    Parameters
    ----------
    parse_dates : bool or list
        Anything other than a list is not validated. List entries are
        column labels, or positions into ``columns`` for non-string entries
        that are not themselves a label.
    columns : list
        List of names of the dataframe.

    Returns
    -------
    set
        The labels of the columns which will get parsed later if a list
        is given as specification; otherwise the empty set.

    Raises
    ------
    ValueError
        If a string entry of ``parse_dates`` is not in ``columns``.
    """
    if not isinstance(parse_dates, list):
        return set()

    absent = set()
    resolved = set()
    for spec in parse_dates:
        if spec in columns:
            resolved.add(spec)
        elif isinstance(spec, str):
            absent.add(spec)
        else:
            # Not a label, so treat it as a position.
            resolved.add(columns[spec])

    if absent:
        missing_cols = ", ".join(sorted(absent))
        raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'")

    return resolved
|
||||
|
||||
|
||||
def _validate_usecols_arg(usecols):
|
||||
"""
|
||||
Validate the 'usecols' parameter.
|
||||
|
||||
Checks whether or not the 'usecols' parameter contains all integers
|
||||
(column selection by index), strings (column by name) or is a callable.
|
||||
Raises a ValueError if that is not the case.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : list-like, callable, or None
|
||||
List of columns to use when parsing or a callable that can be used
|
||||
to filter a list of table columns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
usecols_tuple : tuple
|
||||
A tuple of (verified_usecols, usecols_dtype).
|
||||
|
||||
'verified_usecols' is either a set if an array-like is passed in or
|
||||
'usecols' if a callable or None is passed in.
|
||||
|
||||
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
|
||||
is passed in or None if a callable or None is passed in.
|
||||
"""
|
||||
msg = (
|
||||
"'usecols' must either be list-like of all strings, all unicode, "
|
||||
"all integers or a callable."
|
||||
)
|
||||
if usecols is not None:
|
||||
if callable(usecols):
|
||||
return usecols, None
|
||||
|
||||
if not is_list_like(usecols):
|
||||
# see gh-20529
|
||||
#
|
||||
# Ensure it is iterable container but not string.
|
||||
raise ValueError(msg)
|
||||
|
||||
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
|
||||
|
||||
if usecols_dtype not in ("empty", "integer", "string"):
|
||||
raise ValueError(msg)
|
||||
|
||||
usecols = set(usecols)
|
||||
|
||||
return usecols, usecols_dtype
|
||||
return usecols, None
|
||||
|
||||
|
||||
@overload
def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object],
    names: Iterable[Hashable],
) -> set[int]: ...


@overload
def evaluate_callable_usecols(
    usecols: SequenceT, names: Iterable[Hashable]
) -> SequenceT: ...


def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object] | SequenceT,
    names: Iterable[Hashable],
) -> SequenceT | set[int]:
    """
    Resolve a callable 'usecols' into a set of positions.

    When 'usecols' is callable it is applied to every entry of 'names',
    and the positions of the entries for which it returns a truthy value
    are collected into a set. Any other 'usecols' is returned unchanged.
    """
    if not callable(usecols):
        return usecols

    selected = set()
    for position, label in enumerate(names):
        if usecols(label):
            selected.add(position)
    return selected
|
||||
@ -0,0 +1,395 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import (
|
||||
lib,
|
||||
parsers,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import DtypeWarning
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.common import pandas_dtype
|
||||
from pandas.core.dtypes.concat import (
|
||||
concat_compat,
|
||||
union_categoricals,
|
||||
)
|
||||
from pandas.core.dtypes.dtypes import CategoricalDtype
|
||||
|
||||
from pandas.core.indexes.api import ensure_index_from_sequences
|
||||
|
||||
from pandas.io.common import (
|
||||
dedup_names,
|
||||
is_potential_multi_index,
|
||||
)
|
||||
from pandas.io.parsers.base_parser import (
|
||||
ParserBase,
|
||||
ParserError,
|
||||
date_converter,
|
||||
evaluate_callable_usecols,
|
||||
is_index_col,
|
||||
validate_parse_dates_presence,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import (
|
||||
Hashable,
|
||||
Mapping,
|
||||
Sequence,
|
||||
)
|
||||
|
||||
from pandas._typing import (
|
||||
AnyArrayLike,
|
||||
ArrayLike,
|
||||
DtypeArg,
|
||||
DtypeObj,
|
||||
ReadCsvBuffer,
|
||||
SequenceT,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
Index,
|
||||
MultiIndex,
|
||||
)
|
||||
|
||||
|
||||
class CParserWrapper(ParserBase):
    """
    ParserBase subclass that delegates the parsing to ``parsers.TextReader``.
    """

    # Whether `read` uses the reader's chunked `read_low_memory` path.
    low_memory: bool
    # The underlying TextReader; created in __init__ and released by `close`.
    _reader: parsers.TextReader

    def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
        """
        Create the TextReader for ``src`` and resolve the column bookkeeping.

        Parameters
        ----------
        src : ReadCsvBuffer[str]
            Passed straight to ``parsers.TextReader``.
        **kwds
            Parser options. They are handed to ``ParserBase.__init__`` and,
            after the adjustments below, to ``parsers.TextReader``.
        """
        super().__init__(kwds)
        self.kwds = kwds
        # Work on a copy from here on: the caller's dict (kept as self.kwds)
        # is not modified by the pops/overrides below.
        kwds = kwds.copy()

        self.low_memory = kwds.pop("low_memory", False)

        # #2442
        kwds["allow_leading_cols"] = self.index_col is not False

        # GH20529, validate usecol arg before TextReader
        kwds["usecols"] = self.usecols

        # Have to pass int, would break tests using TextReader directly otherwise :(
        kwds["on_bad_lines"] = self.on_bad_lines.value

        # These options are not forwarded to TextReader.
        for key in (
            "storage_options",
            "encoding",
            "memory_map",
            "compression",
        ):
            kwds.pop(key, None)

        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
        if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
            kwds["dtype_backend"] = "numpy"
        if kwds["dtype_backend"] == "pyarrow":
            # Fail here loudly instead of in cython after reading
            import_optional_dependency("pyarrow")
        self._reader = parsers.TextReader(src, **kwds)

        self.unnamed_cols = self._reader.unnamed_cols

        # NOTE(review): despite its name this is True when NO names were
        # supplied (self.names is None); it may be overwritten by
        # _extract_multi_indexer_columns below.
        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            (
                self.names,
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,
                passed_names,
            )

        if self.names is None:
            # No names available: fall back to positional labels 0..width-1.
            self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = evaluate_callable_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                # Raises ValueError listing the names that are missing.
                self._validate_usecols_names(usecols, self.orig_names)

            if len(self.names) > len(usecols):
                # Keep only names selected by position or by label.
                self.names = [
                    n
                    for i, n in enumerate(self.names)
                    if (i in usecols or n in usecols)
                ]

            if len(self.names) < len(usecols):
                self._validate_usecols_names(
                    usecols,
                    self.names,
                )

        validate_parse_dates_presence(self.parse_dates, self.names)
        self._set_noconvert_columns()

        self.orig_names = self.names

        if self._reader.leading_cols == 0 and is_index_col(self.index_col):
            # No implicit index columns in the data: take the index columns
            # out of self.names according to index_col.
            (
                index_names,
                self.names,
                self.index_col,
            ) = self._clean_index_names(
                self.names,
                self.index_col,
            )

            if self.index_names is None:
                self.index_names = index_names

        if self._reader.header is None and not passed_names:
            assert self.index_names is not None
            self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0

    def close(self) -> None:
        """Close the underlying reader, ignoring a ValueError from it."""
        # close handles opened by C parser
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self) -> None:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        assert self.orig_names is not None
        # error: Cannot determine type of 'names'

        # much faster than using orig_names.index(x) xref GH#44106
        names_dict = {x: i for i, x in enumerate(self.orig_names)}
        # Position of every current name within orig_names.
        col_indices = [names_dict[x] for x in self.names]
        noconvert_columns = self._set_noconvert_dtype_columns(
            col_indices,
            self.names,
        )
        for col in noconvert_columns:
            self._reader.set_noconvert(col)

    def read(
        self,
        nrows: int | None = None,
    ) -> tuple[
        Index | MultiIndex | None,
        Sequence[Hashable] | MultiIndex,
        Mapping[Hashable, AnyArrayLike],
    ]:
        """
        Read up to ``nrows`` rows from the reader.

        Parameters
        ----------
        nrows : int or None
            Forwarded to the reader's ``read`` / ``read_low_memory``.

        Returns
        -------
        tuple
            ``(index, column_names, data)`` where ``data`` maps column
            labels to arrays. If the reader raises StopIteration on the
            first read, empty metadata built by ``_get_empty_meta`` is
            returned instead.

        Raises
        ------
        StopIteration
            Re-raised (after closing the reader) when the reader is
            exhausted and this is not the first read.
        ParserError
            When the reader reports leading columns whose count differs
            from ``len(self.index_col)``.
        """
        index: Index | MultiIndex | None
        column_names: Sequence[Hashable] | MultiIndex
        try:
            if self.low_memory:
                chunks = self._reader.read_low_memory(nrows)
                # destructive to chunks
                data = _concatenate_chunks(chunks, self.names)

            else:
                data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                # Nothing could be read at all: return an empty result that
                # still carries the expected columns and index.
                self._first_chunk = False
                # assert for mypy, orig_names is List or None, None would error in
                # list(...) in dedup_names
                assert self.orig_names is not None
                names = dedup_names(
                    self.orig_names,
                    is_potential_multi_index(self.orig_names, self.index_col),
                )
                index, columns, col_dict = self._get_empty_meta(
                    names,
                    dtype=self.dtype,
                )
                # error: Incompatible types in assignment (expression has type
                # "list[Hashable] | MultiIndex", variable has type "list[Hashable]")
                columns = self._maybe_make_multi_index_columns(  # type: ignore[assignment]
                    columns, self.col_names
                )

                columns = _filter_usecols(self.usecols, columns)
                columns_set = set(columns)

                # Drop the empty columns that usecols filtered out.
                col_dict = {k: v for k, v in col_dict.items() if k in columns_set}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        names = self.names

        if self._reader.leading_cols:
            # implicit index, no index names
            arrays = []

            if self.index_col and self._reader.leading_cols != len(self.index_col):
                raise ParserError(
                    "Could not construct index. Requested to use "
                    f"{len(self.index_col)} number of columns, but "
                    f"{self._reader.leading_cols} left to parse."
                )

            for i in range(self._reader.leading_cols):
                # Pull each index column out of `data` (popping mutates it).
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                if self._should_parse_dates(i):
                    values = date_converter(
                        values,
                        col=(
                            self.index_names[i]
                            if self.index_names is not None
                            else None
                        ),
                        dayfirst=self.dayfirst,
                        cache_dates=self.cache_dates,
                        date_format=self.date_format,
                    )
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            names = _filter_usecols(self.usecols, names)

            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            # rename dict keys
            # Remaining entries are ordered by key and paired with `names`;
            # strict=True makes a length mismatch raise.
            data_tups = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data_tups, strict=True)}

            date_data = self._do_date_conversions(names, data)

            # maybe create a mi on the columns
            column_names = self._maybe_make_multi_index_columns(names, self.col_names)

        else:
            # rename dict keys
            data_tups = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            names = _filter_usecols(self.usecols, names)

            # columns as list
            alldata = [x[1] for x in data_tups]
            if self.usecols is None:
                self._check_data_length(names, alldata)

            # strict=False: surplus names or surplus data columns are dropped
            # silently here (a mismatch was reported by _check_data_length
            # above when usecols is None).
            data = {k: v for k, (i, v) in zip(names, data_tups, strict=False)}

            date_data = self._do_date_conversions(names, data)
            index, column_names = self._make_index(alldata, names)

        return index, column_names, date_data
|
||||
|
||||
|
||||
def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]:
    """
    Restrict ``names`` to the entries selected by ``usecols``.

    A callable ``usecols`` is first resolved to positions. ``names`` is
    returned unchanged when ``usecols`` is None or already has the same
    length as ``names``; otherwise a new list is built from the entries
    whose position or label is contained in ``usecols``.
    """
    # hackish
    selection = evaluate_callable_usecols(usecols, names)
    if selection is None or len(names) == len(selection):
        return names

    kept = []
    for position, label in enumerate(names):
        if position in selection or label in selection:
            kept.append(label)
    return kept
|
||||
|
||||
|
||||
def _concatenate_chunks(
|
||||
chunks: list[dict[int, ArrayLike]], column_names: list[str]
|
||||
) -> dict:
|
||||
"""
|
||||
Concatenate chunks of data read with low_memory=True.
|
||||
|
||||
The tricky part is handling Categoricals, where different chunks
|
||||
may have different inferred categories.
|
||||
"""
|
||||
names = list(chunks[0].keys())
|
||||
warning_columns = []
|
||||
|
||||
result: dict = {}
|
||||
for name in names:
|
||||
arrs = [chunk.pop(name) for chunk in chunks]
|
||||
# Check each arr for consistent types.
|
||||
dtypes = {a.dtype for a in arrs}
|
||||
non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)}
|
||||
|
||||
dtype = dtypes.pop()
|
||||
if isinstance(dtype, CategoricalDtype):
|
||||
result[name] = union_categoricals(arrs, sort_categories=False)
|
||||
else:
|
||||
result[name] = concat_compat(arrs)
|
||||
if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
|
||||
warning_columns.append(column_names[name])
|
||||
|
||||
if warning_columns:
|
||||
warning_names = ", ".join(
|
||||
[f"{index}: {name}" for index, name in enumerate(warning_columns)]
|
||||
)
|
||||
warning_message = " ".join(
|
||||
[
|
||||
f"Columns ({warning_names}) have mixed types. "
|
||||
f"Specify dtype option on import or set low_memory=False."
|
||||
]
|
||||
)
|
||||
warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
|
||||
return result
|
||||
|
||||
|
||||
def ensure_dtype_objs(
    dtype: DtypeArg | dict[Hashable, DtypeArg] | None,
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
    """
    Ensure we have either None, a dtype object, or a dictionary mapping to
    dtype objects.

    Parameters
    ----------
    dtype : dtype-like, dict of dtype-likes, or None
        Every dtype-like is converted with ``pandas_dtype``.

    Returns
    -------
    None, dtype object, dict or defaultdict
        A ``defaultdict`` with a default factory comes back as a
        ``defaultdict`` whose default is the converted factory result. A
        ``defaultdict`` whose ``default_factory`` is None has no default to
        convert (it behaves like a plain dict) and is therefore returned as
        a plain dict instead of failing on the call to the missing factory.
    """
    if dtype is None:
        return None

    if isinstance(dtype, defaultdict) and dtype.default_factory is not None:
        default_dtype = pandas_dtype(dtype.default_factory())
        converted: defaultdict = defaultdict(lambda: default_dtype)
        for key, value in dtype.items():
            converted[key] = pandas_dtype(value)
        return converted

    if isinstance(dtype, dict):
        return {key: pandas_dtype(value) for key, value in dtype.items()}

    return pandas_dtype(dtype)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user