Initial commit: 首次建仓,建立目录结构

This commit is contained in:
FXY
2026-06-11 23:49:54 +08:00
commit 4038a476b5
9396 changed files with 2372905 additions and 0 deletions

View File

@ -0,0 +1,9 @@
# ruff: noqa: TC004
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# import modules that have public classes/functions
from pandas.io.formats import style
# and mark only those modules as public
__all__ = ["style"]

View File

@ -0,0 +1,157 @@
# GH37967: Enable the use of CSS named colors, as defined in
# matplotlib.colors.CSS4_COLORS, when exporting to Excel.
# This data has been copied here, rather than imported from matplotlib,
# so that the ``to_excel`` methods do not require matplotlib.
# source: matplotlib._color_data (3.3.3)
from __future__ import annotations
# Maps each lowercase CSS named color to its six-digit uppercase hex RGB
# string, written without a leading "#".
CSS4_COLORS = {
    "aliceblue": "F0F8FF",
    "antiquewhite": "FAEBD7",
    "aqua": "00FFFF",
    "aquamarine": "7FFFD4",
    "azure": "F0FFFF",
    "beige": "F5F5DC",
    "bisque": "FFE4C4",
    "black": "000000",
    "blanchedalmond": "FFEBCD",
    "blue": "0000FF",
    "blueviolet": "8A2BE2",
    "brown": "A52A2A",
    "burlywood": "DEB887",
    "cadetblue": "5F9EA0",
    "chartreuse": "7FFF00",
    "chocolate": "D2691E",
    "coral": "FF7F50",
    "cornflowerblue": "6495ED",
    "cornsilk": "FFF8DC",
    "crimson": "DC143C",
    "cyan": "00FFFF",
    "darkblue": "00008B",
    "darkcyan": "008B8B",
    "darkgoldenrod": "B8860B",
    "darkgray": "A9A9A9",
    "darkgreen": "006400",
    "darkgrey": "A9A9A9",
    "darkkhaki": "BDB76B",
    "darkmagenta": "8B008B",
    "darkolivegreen": "556B2F",
    "darkorange": "FF8C00",
    "darkorchid": "9932CC",
    "darkred": "8B0000",
    "darksalmon": "E9967A",
    "darkseagreen": "8FBC8F",
    "darkslateblue": "483D8B",
    "darkslategray": "2F4F4F",
    "darkslategrey": "2F4F4F",
    "darkturquoise": "00CED1",
    "darkviolet": "9400D3",
    "deeppink": "FF1493",
    "deepskyblue": "00BFFF",
    "dimgray": "696969",
    "dimgrey": "696969",
    "dodgerblue": "1E90FF",
    "firebrick": "B22222",
    "floralwhite": "FFFAF0",
    "forestgreen": "228B22",
    "fuchsia": "FF00FF",
    "gainsboro": "DCDCDC",
    "ghostwhite": "F8F8FF",
    "gold": "FFD700",
    "goldenrod": "DAA520",
    "gray": "808080",
    "green": "008000",
    "greenyellow": "ADFF2F",
    "grey": "808080",
    "honeydew": "F0FFF0",
    "hotpink": "FF69B4",
    "indianred": "CD5C5C",
    "indigo": "4B0082",
    "ivory": "FFFFF0",
    "khaki": "F0E68C",
    "lavender": "E6E6FA",
    "lavenderblush": "FFF0F5",
    "lawngreen": "7CFC00",
    "lemonchiffon": "FFFACD",
    "lightblue": "ADD8E6",
    "lightcoral": "F08080",
    "lightcyan": "E0FFFF",
    "lightgoldenrodyellow": "FAFAD2",
    "lightgray": "D3D3D3",
    "lightgreen": "90EE90",
    "lightgrey": "D3D3D3",
    "lightpink": "FFB6C1",
    "lightsalmon": "FFA07A",
    "lightseagreen": "20B2AA",
    "lightskyblue": "87CEFA",
    "lightslategray": "778899",
    "lightslategrey": "778899",
    "lightsteelblue": "B0C4DE",
    "lightyellow": "FFFFE0",
    "lime": "00FF00",
    "limegreen": "32CD32",
    "linen": "FAF0E6",
    "magenta": "FF00FF",
    "maroon": "800000",
    "mediumaquamarine": "66CDAA",
    "mediumblue": "0000CD",
    "mediumorchid": "BA55D3",
    "mediumpurple": "9370DB",
    "mediumseagreen": "3CB371",
    "mediumslateblue": "7B68EE",
    "mediumspringgreen": "00FA9A",
    "mediumturquoise": "48D1CC",
    "mediumvioletred": "C71585",
    "midnightblue": "191970",
    "mintcream": "F5FFFA",
    "mistyrose": "FFE4E1",
    "moccasin": "FFE4B5",
    "navajowhite": "FFDEAD",
    "navy": "000080",
    "oldlace": "FDF5E6",
    "olive": "808000",
    "olivedrab": "6B8E23",
    "orange": "FFA500",
    "orangered": "FF4500",
    "orchid": "DA70D6",
    "palegoldenrod": "EEE8AA",
    "palegreen": "98FB98",
    "paleturquoise": "AFEEEE",
    "palevioletred": "DB7093",
    "papayawhip": "FFEFD5",
    "peachpuff": "FFDAB9",
    "peru": "CD853F",
    "pink": "FFC0CB",
    "plum": "DDA0DD",
    "powderblue": "B0E0E6",
    "purple": "800080",
    "rebeccapurple": "663399",
    "red": "FF0000",
    "rosybrown": "BC8F8F",
    "royalblue": "4169E1",
    "saddlebrown": "8B4513",
    "salmon": "FA8072",
    "sandybrown": "F4A460",
    "seagreen": "2E8B57",
    "seashell": "FFF5EE",
    "sienna": "A0522D",
    "silver": "C0C0C0",
    "skyblue": "87CEEB",
    "slateblue": "6A5ACD",
    "slategray": "708090",
    "slategrey": "708090",
    "snow": "FFFAFA",
    "springgreen": "00FF7F",
    "steelblue": "4682B4",
    "tan": "D2B48C",
    "teal": "008080",
    "thistle": "D8BFD8",
    "tomato": "FF6347",
    "turquoise": "40E0D0",
    "violet": "EE82EE",
    "wheat": "F5DEB3",
    "white": "FFFFFF",
    "whitesmoke": "F5F5F5",
    "yellow": "FFFF00",
    "yellowgreen": "9ACD32",
}

View File

@ -0,0 +1,95 @@
"""
Internal module for console introspection
"""
from __future__ import annotations
from shutil import get_terminal_size
def get_console_size() -> tuple[int | None, int | None]:
    """
    Return console size as tuple = (width, height).

    The configured ``display.width`` / ``display.max_rows`` options take
    precedence whenever they are truthy; otherwise a detected value is used.

    Returns
    -------
    tuple of (int or None, int or None)
        ``(None, None)`` is possible in a non-interactive session when both
        options are unset; the caller has to handle that case.
    """
    from pandas import get_option

    option_width = get_option("display.width")
    option_height = get_option("display.max_rows")

    # Three environments are distinguished:
    #   * non-interactive script: the terminal size is disregarded
    #   * interactive non-shell frontend (ipnb/ipqtconsole): the terminal size
    #     cannot be detected, so the option defaults are used
    #   * interactive shell terminal: the terminal size is detected
    # Setting an option to None signals "auto-detect", which can only be
    # honoured in the interactive cases.
    if not in_interactive_session():
        auto_width, auto_height = None, None
    elif in_ipython_frontend():
        # match the defaults for width/height registered in config_init
        from pandas._config.config import get_default_val

        auto_width = get_default_val("display.width")
        auto_height = get_default_val("display.max_rows")
    else:
        # pure terminal
        auto_width, auto_height = get_terminal_size()

    return (option_width or auto_width, option_height or auto_height)
# ----------------------------------------------------------------------
# Detect our environment
def in_interactive_session() -> bool:
    """
    Check if we're running in an interactive shell.

    Returns
    -------
    bool
        True if running under python/ipython interactive shell, or if the
        ``mode.sim_interactive`` option is set.
    """
    from pandas import get_option

    def _main_is_interactive() -> bool:
        # A ``__main__`` module without ``__file__`` means no script is being
        # run; otherwise fall back to the simulation option.
        try:
            import __main__ as main_module
        except ModuleNotFoundError:
            return get_option("mode.sim_interactive")
        if not hasattr(main_module, "__file__"):
            return True
        return get_option("mode.sim_interactive")

    try:
        # error: Name '__IPYTHON__' is not defined
        ipython_flag = __IPYTHON__  # type: ignore[name-defined]
    except NameError:
        return _main_is_interactive()
    return ipython_flag or _main_is_interactive()
def in_ipython_frontend() -> bool:
    """
    Check if we're inside an IPython zmq frontend.

    Returns
    -------
    bool
        True when ``get_ipython`` exists and the type name of the shell it
        returns contains "zmq" (case-insensitive); False otherwise.
    """
    try:
        # error: Name 'get_ipython' is not defined
        shell = get_ipython()  # type: ignore[name-defined]
    except NameError:
        return False
    return "zmq" in str(type(shell)).lower()

View File

@ -0,0 +1,425 @@
"""
Utilities for interpreting CSS from Stylers for formatting non-HTML outputs.
"""
from __future__ import annotations
import re
from typing import TYPE_CHECKING
import warnings
from pandas.errors import CSSWarning
from pandas.util._exceptions import find_stack_level
if TYPE_CHECKING:
from collections.abc import (
Callable,
Generator,
Iterable,
Iterator,
)
def _side_expander(prop_fmt: str) -> Callable:
"""
Wrapper to expand shorthand property into top, right, bottom, left properties
Parameters
----------
side : str
The border side to expand into properties
Returns
-------
function: Return to call when a 'border(-{side}): {value}' string is encountered
"""
def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]:
"""
Expand shorthand property into side-specific property (top, right, bottom, left)
Parameters
----------
prop (str): CSS property name
value (str): String token for property
Yields
------
Tuple (str, str): Expanded property, value
"""
tokens = value.split()
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn(
f'Could not expand "{prop}: {value}"',
CSSWarning,
stacklevel=find_stack_level(),
)
return
for key, idx in zip(self.SIDES, mapping, strict=True):
yield prop_fmt.format(key), tokens[idx]
return expand
def _border_expander(side: str = "") -> Callable:
"""
Wrapper to expand 'border' property into border color, style, and width properties
Parameters
----------
side : str
The border side to expand into properties
Returns
-------
function: Return to call when a 'border(-{side}): {value}' string is encountered
"""
if side != "":
side = f"-{side}"
def expand(self: CSSResolver, prop: str, value: str) -> Generator[tuple[str, str]]:
"""
Expand border into color, style, and width tuples
Parameters
----------
prop : str
CSS property name passed to styler
value : str
Value passed to styler for property
Yields
------
Tuple (str, str): Expanded property, value
"""
tokens = value.split()
if len(tokens) == 0 or len(tokens) > 3:
warnings.warn(
f'Too many tokens provided to "{prop}" (expected 1-3)',
CSSWarning,
stacklevel=find_stack_level(),
)
# TODO: Can we use current color as initial value to comply with CSS standards?
border_declarations = {
f"border{side}-color": "black",
f"border{side}-style": "none",
f"border{side}-width": "medium",
}
for token in tokens:
if token.lower() in self.BORDER_STYLES:
border_declarations[f"border{side}-style"] = token
elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS):
border_declarations[f"border{side}-width"] = token
else:
border_declarations[f"border{side}-color"] = token
# TODO: Warn user if item entered more than once (e.g. "border: red green")
# Per CSS, "border" will reset previous "border-*" definitions
yield from self.atomize(border_declarations.items())
return expand
class CSSResolver:
    """
    A callable for parsing and resolving CSS to atomic properties.
    """

    # Each entry maps a unit to ``(target unit, multiplier)``; ``size_to_pt``
    # follows these links until the unit is "pt".
    UNIT_RATIOS = {
        "pt": ("pt", 1),
        "em": ("em", 1),
        "rem": ("pt", 12),
        "ex": ("em", 0.5),
        # 'ch':
        "px": ("pt", 0.75),
        "pc": ("pt", 12),
        "in": ("pt", 72),
        "cm": ("in", 1 / 2.54),
        "mm": ("in", 1 / 25.4),
        "q": ("mm", 0.25),
        "!!default": ("em", 0),
    }

    FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
    FONT_SIZE_RATIOS.update(
        {
            "%": ("em", 0.01),
            "xx-small": ("rem", 0.5),
            "x-small": ("rem", 0.625),
            "small": ("rem", 0.8),
            "medium": ("rem", 1),
            "large": ("rem", 1.125),
            "x-large": ("rem", 1.5),
            "xx-large": ("rem", 2),
            "smaller": ("em", 1 / 1.2),
            "larger": ("em", 1.2),
            "!!default": ("em", 1),
        }
    )

    MARGIN_RATIOS = UNIT_RATIOS.copy()
    MARGIN_RATIOS.update({"none": ("pt", 0)})

    BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
    BORDER_WIDTH_RATIOS.update(
        {
            "none": ("pt", 0),
            "thick": ("px", 4),
            "medium": ("px", 2),
            "thin": ("px", 1),
            # Default: medium only if solid
        }
    )

    BORDER_STYLES = [
        "none",
        "hidden",
        "dotted",
        "dashed",
        "solid",
        "double",
        "groove",
        "ridge",
        "inset",
        "outset",
        "mediumdashdot",
        "dashdotdot",
        "hair",
        "mediumdashdotdot",
        "dashdot",
        "slantdashdot",
        "mediumdashed",
    ]

    # Keyed by the number of tokens in a shorthand value; each list gives,
    # for (top, right, bottom, left), the index of the token to use.
    SIDE_SHORTHANDS = {
        1: [0, 0, 0, 0],
        2: [0, 1, 0, 1],
        3: [0, 1, 2, 1],
        4: [0, 1, 2, 3],
    }

    SIDES = ("top", "right", "bottom", "left")

    # Shorthand property name -> generator function expanding it.
    CSS_EXPANSIONS = {
        **{
            (f"border-{prop}" if prop else "border"): _border_expander(prop)
            for prop in ["", "top", "right", "bottom", "left"]
        },
        **{
            f"border-{prop}": _side_expander(f"border-{{:s}}-{prop}")
            for prop in ["color", "style", "width"]
        },
        "margin": _side_expander("margin-{:s}"),
        "padding": _side_expander("padding-{:s}"),
    }

    def __call__(
        self,
        declarations: str | Iterable[tuple[str, str]],
        inherited: dict[str, str] | None = None,
    ) -> dict[str, str]:
        """
        Resolve the given declarations to atomic properties.

        Parameters
        ----------
        declarations : str | Iterable[tuple[str, str]]
            A CSS string or set of CSS declaration tuples
            e.g. "font-weight: bold; background: blue" or
            {("font-weight", "bold"), ("background", "blue")}
        inherited : dict, optional
            Atomic properties indicating the inherited style context in which
            declarations is to be resolved. ``inherited`` should already
            be resolved, i.e. valid output of this method.

        Returns
        -------
        dict
            Atomic CSS 2.2 properties.

        Examples
        --------
        >>> resolve = CSSResolver()
        >>> inherited = {"font-family": "serif", "font-weight": "bold"}
        >>> out = resolve(
        ...     '''
        ...               border-color: BLUE RED;
        ...               font-size: 1em;
        ...               font-size: 2em;
        ...               font-weight: normal;
        ...               font-weight: inherit;
        ...               ''',
        ...     inherited,
        ... )
        >>> sorted(out.items())  # doctest: +NORMALIZE_WHITESPACE
        [('border-bottom-color', 'blue'),
         ('border-left-color', 'red'),
         ('border-right-color', 'red'),
         ('border-top-color', 'blue'),
         ('font-family', 'serif'),
         ('font-size', '24pt'),
         ('font-weight', 'bold')]
        """
        if isinstance(declarations, str):
            declarations = self.parse(declarations)
        props = dict(self.atomize(declarations))
        if inherited is None:
            inherited = {}

        props = self._update_initial(props, inherited)
        props = self._update_font_size(props, inherited)
        return self._update_other_units(props)

    def _update_initial(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        """
        Fill in inherited properties and resolve ``inherit`` / ``initial``.

        ``props`` is updated in place with any inherited property it lacks;
        the returned dict is a copy with ``initial`` values dropped.
        """
        # 1. resolve inherited, initial
        for prop, val in inherited.items():
            if prop not in props:
                props[prop] = val

        new_props = props.copy()
        for prop, val in props.items():
            if val == "inherit":
                val = inherited.get(prop, "initial")

            if val in ("initial", None):
                # we do not define a complete initial stylesheet
                del new_props[prop]
            else:
                new_props[prop] = val
        return new_props

    def _update_font_size(
        self,
        props: dict[str, str],
        inherited: dict[str, str],
    ) -> dict[str, str]:
        """Convert ``font-size`` to points, relative to the inherited size."""
        # 2. resolve relative font size
        if props.get("font-size"):
            props["font-size"] = self.size_to_pt(
                props["font-size"],
                self._get_font_size(inherited),
                conversions=self.FONT_SIZE_RATIOS,
            )
        return props

    def _get_font_size(self, props: dict[str, str]) -> float | None:
        """Return the ``font-size`` of ``props`` in points, or None if unset."""
        if props.get("font-size"):
            font_size_string = props["font-size"]
            return self._get_float_font_size_from_pt(font_size_string)
        return None

    def _get_float_font_size_from_pt(self, font_size_string: str) -> float:
        """Parse a size such as ``"12pt"`` into the float ``12.0``."""
        assert font_size_string.endswith("pt")
        # removesuffix drops exactly the trailing "pt"; rstrip("pt") would
        # strip any run of the characters "p" and "t" instead.
        return float(font_size_string.removesuffix("pt"))

    def _update_other_units(self, props: dict[str, str]) -> dict[str, str]:
        """Convert border widths, margins and paddings of every side to points."""
        font_size = self._get_font_size(props)
        # 3. TODO: resolve other font-relative units
        for side in self.SIDES:
            prop = f"border-{side}-width"
            if prop in props:
                props[prop] = self.size_to_pt(
                    props[prop],
                    em_pt=font_size,
                    conversions=self.BORDER_WIDTH_RATIOS,
                )

            for prop in [f"margin-{side}", f"padding-{side}"]:
                if prop in props:
                    # TODO: support %
                    props[prop] = self.size_to_pt(
                        props[prop],
                        em_pt=font_size,
                        conversions=self.MARGIN_RATIOS,
                    )
        return props

    def size_to_pt(
        self, in_val: str, em_pt: float | None = None, conversions: dict = UNIT_RATIOS
    ) -> str:
        """
        Convert a CSS size string to a size in points.

        Parameters
        ----------
        in_val : str
            Size to convert, e.g. ``"2em"``, ``"10px"`` or ``"large"``.
        em_pt : float, optional
            Size of one ``em`` in points. When None, ``em`` is treated as
            ``rem``.
        conversions : dict, default UNIT_RATIOS
            Mapping of unit to ``(target unit, multiplier)``.

        Returns
        -------
        str
            The size formatted as ``"<number>pt"``. A value that cannot be
            handled emits a CSSWarning and resolves to the ``"!!default"``
            entry of ``conversions``.
        """

        def _error() -> str:
            warnings.warn(
                f"Unhandled size: {in_val!r}",
                CSSWarning,
                stacklevel=find_stack_level(),
            )
            return self.size_to_pt("1!!default", conversions=conversions)

        match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val)
        if match is None:
            return _error()

        val, unit = match.groups()
        if val == "":
            # hack for 'large' etc.
            val = 1
        else:
            try:
                val = float(val)
            except ValueError:
                return _error()

        # Follow the conversion chain until the value is expressed in points.
        while unit != "pt":
            if unit == "em":
                if em_pt is None:
                    unit = "rem"
                else:
                    val *= em_pt
                    unit = "pt"
                continue

            try:
                unit, mul = conversions[unit]
            except KeyError:
                return _error()
            val *= mul

        val = round(val, 5)
        if int(val) == val:
            size_fmt = f"{int(val):d}pt"
        else:
            size_fmt = f"{val:f}pt"
        return size_fmt

    def atomize(self, declarations: Iterable) -> Generator[tuple[str, str]]:
        """
        Lowercase each declaration and expand shorthand properties.

        Yields
        ------
        tuple of (str, str)
            Property, value; shorthands listed in ``CSS_EXPANSIONS`` are
            replaced by the pairs their expander yields.
        """
        for prop, value in declarations:
            prop = prop.lower()
            value = value.lower()
            if prop in self.CSS_EXPANSIONS:
                expand = self.CSS_EXPANSIONS[prop]
                yield from expand(self, prop, value)
            else:
                yield prop, value

    def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]:
        """
        Generates (prop, value) pairs from declarations.

        In a future version may generate parsed tokens from tinycss/tinycss2

        Parameters
        ----------
        declarations_str : str
            Semicolon-separated ``prop: value`` declarations.

        Yields
        ------
        tuple of (str, str)
            Stripped, lowercased property and value. A declaration without a
            colon emits a CSSWarning and is skipped.
        """
        for decl in declarations_str.split(";"):
            if not decl.strip():
                continue
            prop, sep, val = decl.partition(":")
            prop = prop.strip().lower()
            # TODO: don't lowercase case sensitive parts of values (strings)
            val = val.strip().lower()
            if sep:
                yield prop, val
            else:
                warnings.warn(
                    f"Ill-formatted attribute: expected a colon in {decl!r}",
                    CSSWarning,
                    stacklevel=find_stack_level(),
                )

View File

@ -0,0 +1,336 @@
"""
Module for formatting output data into CSV files.
"""
from __future__ import annotations
from collections.abc import (
Hashable,
Iterable,
Iterator,
Sequence,
)
import csv as csvlib
import os
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import numpy as np
from pandas._libs import writers as libwriters
from pandas._typing import SequenceNotStr
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
ABCIndex,
ABCMultiIndex,
ABCPeriodIndex,
)
from pandas.core.dtypes.missing import notna
from pandas.core.indexes.api import Index
from pandas.io.common import get_handle
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
FilePath,
FloatFormatType,
IndexLabel,
StorageOptions,
WriteBuffer,
npt,
)
from pandas.io.formats.format import DataFrameFormatter
_DEFAULT_CHUNKSIZE_CELLS = 100_000
class CSVFormatter:
cols: npt.NDArray[np.object_]
def __init__(
self,
formatter: DataFrameFormatter,
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
sep: str = ",",
cols: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
errors: str = "strict",
compression: CompressionOptions = "infer",
quoting: int | None = None,
lineterminator: str | None = "\n",
chunksize: int | None = None,
quotechar: str | None = '"',
date_format: str | None = None,
doublequote: bool = True,
escapechar: str | None = None,
storage_options: StorageOptions | None = None,
) -> None:
self.fmt = formatter
self.obj = self.fmt.frame
self.filepath_or_buffer = path_or_buf
self.encoding = encoding
self.compression: CompressionOptions = compression
self.mode = mode
self.storage_options = storage_options
self.sep = sep
self.index_label = self._initialize_index_label(index_label)
self.errors = errors
self.quoting = quoting or csvlib.QUOTE_MINIMAL
self.doublequote = doublequote
self.escapechar = escapechar
self.quotechar = self._initialize_quotechar(quotechar)
self.lineterminator = lineterminator or os.linesep
self.date_format = date_format
self.cols = self._initialize_columns(cols)
self.chunksize = self._initialize_chunksize(chunksize)
@property
def na_rep(self) -> str:
return self.fmt.na_rep
@property
def float_format(self) -> FloatFormatType | None:
return self.fmt.float_format
@property
def decimal(self) -> str:
return self.fmt.decimal
@property
def header(self) -> bool | SequenceNotStr[str]:
return self.fmt.header
@property
def index(self) -> bool:
return self.fmt.index
def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
if index_label is not False:
if index_label is None:
return self._get_index_label_from_obj()
elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
# given a string for a DF with Index
return [index_label]
return index_label
def _get_index_label_from_obj(self) -> Sequence[Hashable]:
if isinstance(self.obj.index, ABCMultiIndex):
return self._get_index_label_multiindex()
else:
return self._get_index_label_flat()
def _get_index_label_multiindex(self) -> Sequence[Hashable]:
return [name or "" for name in self.obj.index.names]
def _get_index_label_flat(self) -> Sequence[Hashable]:
index_label = self.obj.index.name
return [""] if index_label is None else [index_label]
def _initialize_quotechar(self, quotechar: str | None) -> str | None:
if self.quoting != csvlib.QUOTE_NONE or self.escapechar is not None:
# prevents crash in _csv
return quotechar
return None
@property
def has_mi_columns(self) -> bool:
return bool(isinstance(self.obj.columns, ABCMultiIndex))
def _initialize_columns(
self, cols: Iterable[Hashable] | None
) -> npt.NDArray[np.object_]:
# validate mi options
if self.has_mi_columns:
if cols is not None:
msg = "cannot specify cols with a MultiIndex on the columns"
raise TypeError(msg)
if cols is not None:
if isinstance(cols, ABCIndex):
cols = cols._get_values_for_csv(**self._number_format)
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
# and make sure cols is just a list of labels
new_cols = self.obj.columns
return new_cols._get_values_for_csv(**self._number_format)
def _initialize_chunksize(self, chunksize: int | None) -> int:
if chunksize is None:
return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
return int(chunksize)
@property
def _number_format(self) -> dict[str, Any]:
"""Dictionary used for storing number formatting settings."""
return {
"na_rep": self.na_rep,
"float_format": self.float_format,
"date_format": self.date_format,
"quoting": self.quoting,
"decimal": self.decimal,
}
@cache_readonly
def data_index(self) -> Index:
data_index = self.obj.index
if (
isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
and self.date_format is not None
):
data_index = Index(
[x.strftime(self.date_format) if notna(x) else "" for x in data_index]
)
elif isinstance(data_index, ABCMultiIndex):
data_index = data_index.remove_unused_levels()
return data_index
@property
def nlevels(self) -> int:
if self.index:
return getattr(self.data_index, "nlevels", 1)
else:
return 0
@property
def _has_aliases(self) -> bool:
return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
@property
def _need_to_save_header(self) -> bool:
return bool(self._has_aliases or self.header)
@property
def write_cols(self) -> SequenceNotStr[Hashable]:
if self._has_aliases:
assert not isinstance(self.header, bool)
if len(self.header) != len(self.cols):
raise ValueError(
f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
)
return self.header
else:
# self.cols is an ndarray derived from Index._get_values_for_csv,
# so its entries are strings, i.e. hashable
return cast(SequenceNotStr[Hashable], self.cols)
@property
def encoded_labels(self) -> list[Hashable]:
encoded_labels: list[Hashable] = []
if self.index and self.index_label:
assert isinstance(self.index_label, Sequence)
encoded_labels = list(self.index_label)
if not self.has_mi_columns or self._has_aliases:
encoded_labels += list(self.write_cols)
return encoded_labels
def save(self) -> None:
"""
Create the writer & save.
"""
# apply compression and byte/text conversion
with get_handle(
self.filepath_or_buffer,
self.mode,
encoding=self.encoding,
errors=self.errors,
compression=self.compression,
storage_options=self.storage_options,
) as handles:
# Note: self.encoding is irrelevant here
# error: Argument "quoting" to "writer" has incompatible type "int";
# expected "Literal[0, 1, 2, 3]"
self.writer = csvlib.writer(
handles.handle,
lineterminator=self.lineterminator,
delimiter=self.sep,
quoting=self.quoting, # type: ignore[arg-type]
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar,
)
self._save()
def _save(self) -> None:
if self._need_to_save_header:
self._save_header()
self._save_body()
def _save_header(self) -> None:
if not self.has_mi_columns or self._has_aliases:
self.writer.writerow(self.encoded_labels)
else:
for row in self._generate_multiindex_header_rows():
self.writer.writerow(row)
def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
columns = self.obj.columns
for i in range(columns.nlevels):
# we need at least 1 index column to write our col names
col_line = []
if self.index:
# name is the first column
col_line.append(columns.names[i])
if isinstance(self.index_label, list) and len(self.index_label) > 1:
col_line.extend([""] * (len(self.index_label) - 1))
col_line.extend(columns._get_level_values(i))
yield col_line
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
if self.encoded_labels and set(self.encoded_labels) != {""}:
yield self.encoded_labels + [""] * len(columns)
def _save_body(self) -> None:
nrows = len(self.data_index)
chunks = (nrows // self.chunksize) + 1
for i in range(chunks):
start_i = i * self.chunksize
end_i = min(start_i + self.chunksize, nrows)
if start_i >= end_i:
break
self._save_chunk(start_i, end_i)
def _save_chunk(self, start_i: int, end_i: int) -> None:
# create the data for a chunk
slicer = slice(start_i, end_i)
df = self.obj.iloc[slicer]
res = df._get_values_for_csv(**self._number_format)
data = list(res._iter_column_arrays())
ix = (
self.data_index[slicer]._get_values_for_csv(**self._number_format)
if self.nlevels != 0
else np.empty(end_i - start_i)
)
libwriters.write_csv_rows(
data,
ix,
self.nlevels,
self.cols,
self.writer,
)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,657 @@
"""
Module for formatting output data in HTML.
"""
from __future__ import annotations
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Any,
Final,
cast,
)
from pandas._config import get_option
from pandas._libs import lib
from pandas import (
MultiIndex,
option_context,
)
from pandas.io.common import is_url
from pandas.io.formats.format import (
DataFrameFormatter,
get_level_lengths,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterable,
Mapping,
)
class HTMLFormatter:
"""
Internal class for formatting output data in html.
This class is intended for shared functionality between
DataFrame.to_html() and DataFrame._repr_html_().
Any logic in common with other output formatting methods
should ideally be inherited from classes in format.py
and this class responsible for only producing html markup.
"""
indent_delta: Final = 2
def __init__(
self,
formatter: DataFrameFormatter,
classes: str | list[str] | tuple[str, ...] | None = None,
border: int | bool | None = None,
table_id: str | None = None,
render_links: bool = False,
) -> None:
self.fmt = formatter
self.classes = classes
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements: list[str] = []
self.bold_rows = self.fmt.bold_rows
self.escape = self.fmt.escape
self.show_dimensions = self.fmt.show_dimensions
if border is None or border is True:
border = cast(int, get_option("display.html.border"))
elif not border:
border = None
self.border = border
self.table_id = table_id
self.render_links = render_links
self.col_space = {}
is_multi_index = isinstance(self.columns, MultiIndex)
for column, value in self.fmt.col_space.items():
col_space_value = f"{value}px" if isinstance(value, int) else value
self.col_space[column] = col_space_value
# GH 53885: Handling case where column is index
# Flatten the data in the multi index and add in the map
if is_multi_index and isinstance(column, tuple):
for column_index in column:
self.col_space[str(column_index)] = col_space_value
def to_string(self) -> str:
    """Render the table and join the resulting lines with newlines."""
    rendered = self.render()
    contains_str = any(isinstance(element, str) for element in rendered)
    if contains_str:
        rendered = list(map(str, rendered))
    return "\n".join(rendered)
def render(self) -> list[str]:
    """
    Write the table and, when requested, a trailing dimensions paragraph.

    Returns
    -------
    list of str
        The accumulated output lines (``self.elements``).
    """
    self._write_table()

    if self.should_show_dimensions:
        times_sign = chr(215)  # ×  # noqa: RUF003
        n_rows = len(self.frame)
        n_cols = len(self.frame.columns)
        self.write(f"<p>{n_rows} rows {times_sign} {n_cols} columns</p>")

    return self.elements
@property
def should_show_dimensions(self) -> bool:
    """Value of ``should_show_dimensions`` on the underlying formatter."""
    return self.fmt.should_show_dimensions

@property
def show_row_idx_names(self) -> bool:
    """Value of ``show_row_idx_names`` on the underlying formatter."""
    return self.fmt.show_row_idx_names

@property
def show_col_idx_names(self) -> bool:
    """Value of ``show_col_idx_names`` on the underlying formatter."""
    return self.fmt.show_col_idx_names
@property
def row_levels(self) -> int:
    """
    Number of leading cells reserved for the row index in each table row.
    """
    if not self.fmt.index:
        # not showing (row) index
        #
        # see gh-22579
        # Column misalignment also occurs for
        # a standard index when the columns index is named.
        # If the row index is not displayed a column of
        # blank cells need to be included before the DataFrame values.
        return 1 if self.show_col_idx_names else 0

    # showing (row) index
    return self.frame.index.nlevels
def _get_columns_formatted_values(self) -> Iterable:
    """Return the column labels to write; here simply ``self.columns``."""
    return self.columns

@property
def is_truncated(self) -> bool:
    """Value of ``is_truncated`` on the underlying formatter."""
    return self.fmt.is_truncated

@property
def ncols(self) -> int:
    """Number of columns in the formatter's ``tr_frame``."""
    return len(self.fmt.tr_frame.columns)
def write(self, s: Any, indent: int = 0) -> None:
    """Append ``s``, pretty-printed and prefixed by ``indent`` spaces, to the output."""
    text = pprint_thing(s)
    padding = " " * indent
    self.elements.append(padding + text)
def write_th(
    self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
) -> None:
    """
    Method for writing a formatted <th> cell.

    If col_space is set on the formatter then that is used for
    the value of min-width.

    Parameters
    ----------
    s : object
        The data to be written inside the cell.
    header : bool, default False
        Set to True if the <th> is for use inside <thead>.  This will
        cause min-width to be set if there is one.
    indent : int, default 0
        The indentation level of the cell.
    tags : str, default None
        Tags to include in the cell.

    Returns
    -------
    None
        The cell is appended to the output through ``_write_cell``.
    """
    col_space = self.col_space.get(s, None)

    if header and col_space is not None:
        style_attr = f'style="min-width: {col_space};"'
        # Separate the style attribute from any existing attributes with a
        # space; HTML requires whitespace between attributes.
        tags = f"{tags} {style_attr}" if tags else style_attr

    self._write_cell(s, kind="th", indent=indent, tags=tags)
def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
    """Write ``s`` as a ``<td>`` cell by delegating to ``_write_cell``."""
    self._write_cell(s, kind="td", indent=indent, tags=tags)
def _write_cell(
self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
) -> None:
if tags is not None:
start_tag = f"<{kind} {tags}>"
else:
start_tag = f"<{kind}>"
if self.escape:
# escape & first to prevent double escaping of &
esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
else:
esc = {}
rs = pprint_thing(s, escape_chars=esc).strip()
# replace spaces betweens strings with non-breaking spaces
rs = rs.replace(" ", "&nbsp;&nbsp;")
if self.render_links and is_url(rs):
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
start_tag += f'<a href="{rs_unescaped}" target="_blank">'
end_a = "</a>"
else:
end_a = ""
self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
def write_tr(
    self,
    line: Iterable,
    indent: int = 0,
    indent_delta: int = 0,
    header: bool = False,
    align: str | None = None,
    tags: dict[int, str] | None = None,
    nindex_levels: int = 0,
) -> None:
    """
    Write one ``<tr>`` row, emitting a cell for each item of ``line``.

    Cells are written as ``<th>`` when ``header`` is set, or when
    ``self.bold_rows`` is set and the cell is one of the first
    ``nindex_levels`` cells; all other cells are written as ``<td>``.
    ``tags`` maps a cell position to extra attributes for that cell.
    """
    cell_tags = {} if tags is None else tags

    opening = "<tr>" if align is None else f'<tr style="text-align: {align};">'
    self.write(opening, indent)

    cell_indent = indent + indent_delta
    for position, item in enumerate(line):
        extra = cell_tags.get(position, None)
        is_index_cell = self.bold_rows and position < nindex_levels
        if header or is_index_cell:
            self.write_th(item, indent=cell_indent, header=header, tags=extra)
        else:
            self.write_td(item, cell_indent, tags=extra)

    self.write("</tr>", indent)
def _write_table(self, indent: int = 0) -> None:
_classes = ["dataframe"] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
_classes.append("tex2jax_ignore")
_classes.append("mathjax_ignore")
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
raise TypeError(
"classes must be a string, list, "
f"or tuple, not {type(self.classes)}"
)
_classes.extend(self.classes)
if self.table_id is None:
id_section = ""
else:
id_section = f' id="{self.table_id}"'
if self.border is None:
border_attr = ""
else:
border_attr = f' border="{self.border}"'
self.write(
f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
indent,
)
if self.fmt.header or self.show_row_idx_names:
self._write_header(indent + self.indent_delta)
self._write_body(indent + self.indent_delta)
self.write("</table>", indent)
def _write_col_header(self, indent: int) -> None:
"""
Write the header row(s) that hold the column labels.

For MultiIndex columns one ``<tr>`` is written per column level; otherwise
a single ``<tr>`` is written. When the output is truncated horizontally a
"..." cell is inserted at position ``self.fmt.tr_col_num``.

Parameters
----------
indent : int
Indentation passed through to ``self.write_tr``.
"""
row: list[Hashable]
is_truncated_horizontally = self.fmt.is_truncated_horizontally
if isinstance(self.columns, MultiIndex):
# Attribute string attached to header cells spanning several columns.
template = 'colspan="{span:d}" halign="left"'
sentinel: lib.NoDefault | bool
if self.fmt.sparsify:
# GH3547
sentinel = lib.no_default
else:
sentinel = False
# One sequence of formatted labels per column level.
levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
# One dict per level; iterated below as (tag, span) pairs, where tag is a
# column position and span the number of columns its cell covers.
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
for lnum, (records, values) in enumerate(
zip(level_lengths, levels, strict=True)
):
if is_truncated_horizontally:
# modify the header lines
ins_col = self.fmt.tr_col_num
if self.fmt.sparsify:
recs_new = {}
# Increment tags after ... col.
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
elif tag + span > ins_col:
# The span straddles the insertion point: widen it by one.
recs_new[tag] = span + 1
if lnum == inner_lvl:
values = (
*values[:ins_col],
"...",
*values[ins_col:],
)
else:
# sparse col headers do not receive a ...
values = (
*values[:ins_col],
values[ins_col - 1],
*values[ins_col:],
)
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
values = (*values[:ins_col], "...", *values[ins_col:])
records = recs_new
# Same value as computed before the loop; level_lengths is not
# resized anywhere in this method.
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
records[ins_col] = 1
else:
# Without sparsification: shift every position at or after the
# insertion point by one and give the "..." column a span of 1.
recs_new = {}
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
else:
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
values = [*values[:ins_col], "...", *values[ins_col:]]
# see gh-22579
# Column Offset Bug with to_html(index=False) with
# MultiIndex Columns and Index.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code
# block below for standard columns index.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class and create a
# _get_formatted_column_labels function for code
# parity with DataFrameFormatter class.
if self.fmt.show_index_names:
name = self.columns.names[lnum]
row.append(pprint_thing(name or ""))
else:
row.append("")
tags = {}
# j is the position of the next cell in ``row``; tags are keyed by it so
# they line up with the positions write_tr enumerates.
j = len(row)
for i, v in enumerate(values):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
# No record for this position: no cell is written for it.
continue
j += 1
row.append(v)
self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
else:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code block
# above for columns MultiIndex.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class.
if self.fmt.show_index_names:
# NOTE(review): ``or ""`` also blanks falsy names such as 0 - confirm intended.
row.append(self.columns.name or "")
else:
row.append("")
row.extend(self._get_columns_formatted_values())
align = self.fmt.justify
if is_truncated_horizontally:
# The insertion position is offset by self.row_levels
# (presumably to skip the leading index cells - confirm).
ins_col = self.row_levels + self.fmt.tr_col_num
row.insert(ins_col, "...")
self.write_tr(row, indent, self.indent_delta, header=True, align=align)
def _write_row_header(self, indent: int) -> None:
is_truncated_horizontally = self.fmt.is_truncated_horizontally
row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
self.ncols + (1 if is_truncated_horizontally else 0)
)
self.write_tr(row, indent, self.indent_delta, header=True)
def _write_header(self, indent: int) -> None:
self.write("<thead>", indent)
if self.fmt.header:
self._write_col_header(indent + self.indent_delta)
if self.show_row_idx_names:
self._write_row_header(indent + self.indent_delta)
self.write("</thead>", indent)
def _get_formatted_values(self) -> dict[int, list[str]]:
with option_context("display.max_colwidth", None):
fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
return fmt_values
def _write_body(self, indent: int) -> None:
self.write("<tbody>", indent)
fmt_values = self._get_formatted_values()
# write values
if self.fmt.index and isinstance(self.frame.index, MultiIndex):
self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
else:
self._write_regular_rows(fmt_values, indent + self.indent_delta)
self.write("</tbody>", indent)
def _write_regular_rows(
self, fmt_values: Mapping[int, list[str]], indent: int
) -> None:
is_truncated_horizontally = self.fmt.is_truncated_horizontally
is_truncated_vertically = self.fmt.is_truncated_vertically
nrows = len(self.fmt.tr_frame)
if self.fmt.index:
fmt = self.fmt._get_formatter("__index__")
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
# only reached with non-Multi index
index_values = self.fmt.tr_frame.index._format_flat(include_name=False)
row: list[str] = []
for i in range(nrows):
if is_truncated_vertically and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
row = []
if self.fmt.index:
row.append(index_values[i])
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Add blank cell before data cells.
elif self.show_col_idx_names:
row.append("")
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
dot_col_ix = self.fmt.tr_col_num + self.row_levels
row.insert(dot_col_ix, "...")
self.write_tr(
row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
)
def _write_hierarchical_rows(
self, fmt_values: Mapping[int, list[str]], indent: int
) -> None:
template = 'rowspan="{span}" valign="top"'
is_truncated_horizontally = self.fmt.is_truncated_horizontally
is_truncated_vertically = self.fmt.is_truncated_vertically
frame = self.fmt.tr_frame
nrows = len(frame)
assert isinstance(frame.index, MultiIndex)
idx_values = frame.index._format_multi(sparsify=False, include_names=False)
idx_values = list(zip(*idx_values, strict=True))
if self.fmt.sparsify:
# GH3547
sentinel = lib.no_default
levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
if is_truncated_vertically:
# Insert ... row and adjust idx_values and
# level_lengths to take this into account.
ins_row = self.fmt.tr_row_num
inserted = False
for lnum, records in enumerate(level_lengths):
rec_new = {}
for tag, span in list(records.items()):
if tag >= ins_row:
rec_new[tag + 1] = span
elif tag + span > ins_row:
rec_new[tag] = span + 1
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
dot_row[-1] = "..."
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
# If ins_row lies between tags, all cols idx cols
# receive ...
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
idx_values.insert(
ins_row, tuple(["..."] * len(level_lengths))
)
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in fmt_values:
fmt_values[ix_col].insert(ins_row, "...")
nrows += 1
for i in range(nrows):
row = []
tags = {}
sparse_offset = 0
j = 0
for records, v in zip(level_lengths, idx_values[i], strict=True):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
sparse_offset += 1
continue
j += 1
row.append(v)
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
row.insert(
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
)
self.write_tr(
row,
indent,
self.indent_delta,
tags=tags,
nindex_levels=len(levels) - sparse_offset,
)
else:
row = []
for i in range(len(frame)):
if is_truncated_vertically and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
idx_values = list(
zip(
*frame.index._format_multi(sparsify=False, include_names=False),
strict=True,
)
)
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(self.ncols))
if is_truncated_horizontally:
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
self.write_tr(
row,
indent,
self.indent_delta,
tags=None,
nindex_levels=frame.index.nlevels,
)
class NotebookFormatter(HTMLFormatter):
"""
Internal class for formatting output data in html for display in Jupyter
Notebooks. This class is intended for functionality specific to
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
"""
def _get_formatted_values(self) -> dict[int, list[str]]:
# Formats each column with self.fmt.format_col, keyed by column position,
# under whatever display options are currently active.
return {i: self.fmt.format_col(i) for i in range(self.ncols)}
def _get_columns_formatted_values(self) -> list[str]:
# only reached with non-Multi Index
return self.columns._format_flat(include_name=False)
def write_style(self) -> None:
# Builds a <style scoped> element from (selector, property, value)
# triples and writes it with self.write.
# We use the "scoped" attribute here so that the desired
# style properties for the data frame are not then applied
# throughout the entire notebook.
template_first = """\
<style scoped>"""
template_last = """\
</style>"""
# %-template for one CSS rule; filled with (selector, property, value).
template_select = """\
.dataframe %s {
%s: %s;
}"""
element_props = [
("tbody tr th:only-of-type", "vertical-align", "middle"),
("tbody tr th", "vertical-align", "top"),
]
# Header alignment rules depend on whether the columns are a MultiIndex.
if isinstance(self.columns, MultiIndex):
element_props.append(("thead tr th", "text-align", "left"))
if self.show_row_idx_names:
element_props.append(
("thead tr:last-of-type th", "text-align", "right")
)
else:
element_props.append(("thead th", "text-align", "right"))
# Rules are separated by a blank line; dedent strips the indentation
# common to all lines of the assembled template.
template_mid = "\n\n".join(template_select % t for t in element_props)
template = dedent(f"{template_first}\n{template_mid}\n{template_last}")
self.write(template)
def render(self) -> list[str]:
# Wraps the style block and the parent class's output in a <div>;
# the return value of super().render() is not used.
self.write("<div>")
self.write_style()
super().render()
self.write("</div>")
return self.elements

View File

@ -0,0 +1,943 @@
from __future__ import annotations
from abc import (
ABC,
abstractmethod,
)
import sys
from textwrap import dedent
from typing import TYPE_CHECKING
from pandas._config import get_option
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Iterable,
Iterator,
Mapping,
Sequence,
)
from pandas._typing import (
Dtype,
WriteBuffer,
)
from pandas import (
DataFrame,
Index,
Series,
)
# Parameter description for ``show_counts``; used as the "show_counts_sub"
# entry of ``series_sub_kwargs`` below.
show_counts_sub = dedent(
"""\
show_counts : bool, optional
Whether to show the non-null counts. By default, this is shown
only if the DataFrame is smaller than
``pandas.options.display.max_info_rows`` and
``pandas.options.display.max_info_columns``. A value of True always
shows the counts, and False never shows the counts."""
)
# Doctest-style usage examples for ``Series.info``; used as the
# "examples_sub" entry of ``series_sub_kwargs`` below.
series_examples_sub = dedent(
"""\
>>> int_values = [1, 2, 3, 4, 5]
>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
>>> s = pd.Series(text_values, index=int_values)
>>> s.info()
<class 'pandas.Series'>
Index: 5 entries, 1 to 5
Series name: None
Non-Null Count Dtype
-------------- -----
5 non-null object
dtypes: object(1)
memory usage: 80.0+ bytes
Prints a summary excluding information about its values:
>>> s.info(verbose=False)
<class 'pandas.Series'>
Index: 5 entries, 1 to 5
dtypes: object(1)
memory usage: 80.0+ bytes
Pipe output of Series.info to buffer instead of sys.stdout, get
buffer content and writes to a text file:
>>> import io
>>> buffer = io.StringIO()
>>> s.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w",
... encoding="utf-8") as f: # doctest: +SKIP
... f.write(s)
260
The `memory_usage` parameter allows deep introspection mode, specially
useful for big Series and fine-tune memory optimization:
>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
>>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
>>> s.info()
<class 'pandas.Series'>
RangeIndex: 1000000 entries, 0 to 999999
Series name: None
Non-Null Count Dtype
-------------- -----
1000000 non-null object
dtypes: object(1)
memory usage: 7.6+ MB
>>> s.info(memory_usage='deep')
<class 'pandas.Series'>
RangeIndex: 1000000 entries, 0 to 999999
Series name: None
Non-Null Count Dtype
-------------- -----
1000000 non-null object
dtypes: object(1)
memory usage: 55.3 MB"""
)
# "See Also" entries used as the "see_also_sub" entry of ``series_sub_kwargs``.
series_see_also_sub = dedent(
"""\
Series.describe: Generate descriptive statistics of Series.
Series.memory_usage: Memory usage of Series."""
)
# Description of ``max_cols`` used as the "max_cols_sub" entry of
# ``series_sub_kwargs``.
series_max_cols_sub = dedent(
"""\
max_cols : int, optional
Unused, exists only for compatibility with DataFrame.info."""
)
# Collects the Series-specific text fragments defined above under their
# substitution keys (presumably consumed when building the ``Series.info``
# docstring - the consumer is not visible in this module section).
series_sub_kwargs = {
"klass": "Series",
"type_sub": "",
"max_cols_sub": series_max_cols_sub,
"show_counts_sub": show_counts_sub,
"examples_sub": series_examples_sub,
"see_also_sub": series_see_also_sub,
"version_added_sub": "\n.. versionadded:: 1.4.0\n",
}
def _put_str(s: str | Dtype, space: int) -> str:
"""
Make string of specified length, padding to the right if necessary.
Parameters
----------
s : Union[str, Dtype]
String to be formatted.
space : int
Length to force string to be of.
Returns
-------
str
String coerced to given length.
Examples
--------
>>> pd.io.formats.info._put_str("panda", 6)
'panda '
>>> pd.io.formats.info._put_str("panda", 4)
'pand'
"""
return str(s)[:space].ljust(space)
def _sizeof_fmt(num: float, size_qualifier: str) -> str:
"""
Return size in human readable format.
Parameters
----------
num : int
Size in bytes.
size_qualifier : str
Either empty, or '+' (if lower bound).
Returns
-------
str
Size in human readable format.
Examples
--------
>>> _sizeof_fmt(23028, "")
'22.5 KB'
>>> _sizeof_fmt(23028, "+")
'22.5+ KB'
"""
for x in ["bytes", "KB", "MB", "GB", "TB"]:
if num < 1024.0:
return f"{num:3.1f}{size_qualifier} {x}"
num /= 1024.0
return f"{num:3.1f}{size_qualifier} PB"
def _initialize_memory_usage(
memory_usage: bool | str | None = None,
) -> bool | str:
"""Get memory usage based on inputs and display options."""
if memory_usage is None:
memory_usage = get_option("display.memory_usage")
return memory_usage
class _BaseInfo(ABC):
"""
Base class for DataFrameInfo and SeriesInfo.
Parameters
----------
data : DataFrame or Series
Either dataframe or series.
memory_usage : bool or str, optional
If "deep", introspect the data deeply by interrogating object dtypes
for system-level memory consumption, and include it in the returned
values.
"""
data: DataFrame | Series
memory_usage: bool | str
@property
@abstractmethod
def dtypes(self) -> Iterable[Dtype]:
"""
Dtypes.
Returns
-------
dtypes : sequence
Dtype of each of the DataFrame's columns (or one series column).
"""
@property
@abstractmethod
def dtype_counts(self) -> Mapping[str, int]:
"""Mapping dtype - number of counts."""
@property
@abstractmethod
def non_null_counts(self) -> list[int] | Series:
"""Sequence of non-null counts for all columns or column (if series)."""
@property
@abstractmethod
def memory_usage_bytes(self) -> int:
"""
Memory usage in bytes.
Returns
-------
memory_usage_bytes : int
Object's total memory usage in bytes.
"""
@property
def memory_usage_string(self) -> str:
"""Memory usage in a form of human readable string."""
return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"
@property
def size_qualifier(self) -> str:
size_qualifier = ""
if self.memory_usage:
if self.memory_usage != "deep":
# size_qualifier is just a best effort; not guaranteed to catch
# all cases (e.g., it misses categorical data even with object
# categories)
if (
"object" in self.dtype_counts
or self.data.index._is_memory_usage_qualified
):
size_qualifier = "+"
return size_qualifier
@abstractmethod
def render(
self,
*,
buf: WriteBuffer[str] | None,
max_cols: int | None,
verbose: bool | None,
show_counts: bool | None,
) -> None:
pass
class DataFrameInfo(_BaseInfo):
    """
    Summary information about a DataFrame.
    """

    def __init__(
        self,
        data: DataFrame,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: DataFrame = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Number of columns per dtype name."""
        return _get_dataframe_dtype_counts(self.data)

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """
        Dtypes.

        Returns
        -------
        dtypes
            Dtype of each of the DataFrame's columns.
        """
        frame = self.data
        return frame.dtypes

    @property
    def ids(self) -> Index:
        """
        Column names.

        Returns
        -------
        ids : Index
            DataFrame's column names.
        """
        frame = self.data
        return frame.columns

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        columns = self.ids
        return len(columns)

    @property
    def non_null_counts(self) -> Series:
        """Non-null count of every column, as returned by ``DataFrame.count``."""
        frame = self.data
        return frame.count()

    @property
    def memory_usage_bytes(self) -> int:
        """Total memory usage of columns and index; deep only in "deep" mode."""
        per_item = self.data.memory_usage(
            index=True, deep=self.memory_usage == "deep"
        )
        return per_item.sum()

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None,
        max_cols: int | None,
        verbose: bool | None,
        show_counts: bool | None,
    ) -> None:
        """Write the summary to ``buf`` using a _DataFrameInfoPrinter."""
        _DataFrameInfoPrinter(
            info=self,
            max_cols=max_cols,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)
class SeriesInfo(_BaseInfo):
    """
    Summary information about a Series.
    """

    def __init__(
        self,
        data: Series,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: Series = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None = None,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        """
        Write the summary to ``buf`` using a _SeriesInfoPrinter.

        Raises
        ------
        ValueError
            If ``max_cols`` is not None.
        """
        if max_cols is not None:
            raise ValueError(
                "Argument `max_cols` can only be passed "
                "in DataFrame.info, not Series.info"
            )
        _SeriesInfoPrinter(
            info=self,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)

    @property
    def non_null_counts(self) -> list[int]:
        """Single-element list holding the series' non-null count."""
        count = self.data.count()
        return [count]

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """Single-element list holding the series' dtype."""
        dtype = self.data.dtypes
        return [dtype]

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Dtype counts of the series wrapped in a DataFrame."""
        from pandas.core.frame import DataFrame

        as_frame = DataFrame(self.data)
        return _get_dataframe_dtype_counts(as_frame)

    @property
    def memory_usage_bytes(self) -> int:
        """Memory usage in bytes.

        Returns
        -------
        memory_usage_bytes : int
            Object's total memory usage in bytes.
        """
        return self.data.memory_usage(index=True, deep=self.memory_usage == "deep")
class _InfoPrinterAbstract:
"""
Class for printing dataframe or series info.
"""
def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
"""Save dataframe info into buffer."""
table_builder = self._create_table_builder()
lines = table_builder.get_lines()
if buf is None: # pragma: no cover
buf = sys.stdout
fmt.buffer_put_lines(buf, lines)
@abstractmethod
def _create_table_builder(self) -> _TableBuilderAbstract:
"""Create instance of table builder."""
class _DataFrameInfoPrinter(_InfoPrinterAbstract):
    """
    Printer of DataFrame summaries.

    Parameters
    ----------
    info : DataFrameInfo
        Instance of DataFrameInfo.
    max_cols : int, optional
        When to switch from the verbose to the truncated output.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: DataFrameInfo,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        # max_cols must be resolved first: the show_counts default reads it.
        self.max_cols = self._initialize_max_cols(max_cols)
        self.show_counts = self._initialize_show_counts(show_counts)

    @property
    def max_rows(self) -> int:
        """Value of the ``display.max_info_rows`` option."""
        return get_option("display.max_info_rows")

    @property
    def exceeds_info_cols(self) -> bool:
        """True if the number of columns is greater than ``max_cols``."""
        too_many = self.col_count > self.max_cols
        return bool(too_many)

    @property
    def exceeds_info_rows(self) -> bool:
        """True if the number of rows is greater than ``max_rows``."""
        too_many = len(self.data) > self.max_rows
        return bool(too_many)

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return self.info.col_count

    def _initialize_max_cols(self, max_cols: int | None) -> int:
        """Return ``max_cols``, or ``display.max_info_columns`` if it is None."""
        if max_cols is not None:
            return max_cols
        return get_option("display.max_info_columns")

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """
        Return ``show_counts``; if it is None, counts are shown only when
        neither the column nor the row limit is exceeded.
        """
        if show_counts is not None:
            return show_counts
        return not (self.exceeds_info_cols or self.exceeds_info_rows)

    def _create_table_builder(self) -> _DataFrameTableBuilder:
        """
        Create instance of table builder based on verbosity and display settings.
        """
        if self.verbose:
            want_verbose = True
        elif self.verbose is False:  # specifically set to False, not necessarily None
            want_verbose = False
        else:
            want_verbose = not self.exceeds_info_cols

        if not want_verbose:
            return _DataFrameTableBuilderNonVerbose(info=self.info)
        return _DataFrameTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )
class _SeriesInfoPrinter(_InfoPrinterAbstract):
    """Printer of Series summaries.

    Parameters
    ----------
    info : SeriesInfo
        Instance of SeriesInfo.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: SeriesInfo,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        self.show_counts = self._initialize_show_counts(show_counts)

    def _create_table_builder(self) -> _SeriesTableBuilder:
        """
        Create instance of table builder based on verbosity.

        The verbose builder is used unless ``verbose`` is falsy and not None.
        """
        want_verbose = self.verbose or self.verbose is None
        if not want_verbose:
            return _SeriesTableBuilderNonVerbose(info=self.info)
        return _SeriesTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """Return ``show_counts``, defaulting to True when it is None."""
        return True if show_counts is None else show_counts
class _TableBuilderAbstract(ABC):
"""
Abstract builder for info table.
"""
_lines: list[str]
info: _BaseInfo
@abstractmethod
def get_lines(self) -> list[str]:
"""Product in a form of list of lines (strings)."""
@property
def data(self) -> DataFrame | Series:
return self.info.data
@property
def dtypes(self) -> Iterable[Dtype]:
"""Dtypes of each of the DataFrame's columns."""
return self.info.dtypes
@property
def dtype_counts(self) -> Mapping[str, int]:
"""Mapping dtype - number of counts."""
return self.info.dtype_counts
@property
def display_memory_usage(self) -> bool:
"""Whether to display memory usage."""
return bool(self.info.memory_usage)
@property
def memory_usage_string(self) -> str:
"""Memory usage string with proper size qualifier."""
return self.info.memory_usage_string
@property
def non_null_counts(self) -> list[int] | Series:
return self.info.non_null_counts
def add_object_type_line(self) -> None:
"""Add line with string representation of dataframe to the table."""
self._lines.append(str(type(self.data)))
def add_index_range_line(self) -> None:
"""Add line with range of indices to the table."""
self._lines.append(self.data.index._summary())
def add_dtypes_line(self) -> None:
"""Add summary line with dtypes present in dataframe."""
collected_dtypes = [
f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items())
]
self._lines.append(f"dtypes: {', '.join(collected_dtypes)}")
class _DataFrameTableBuilder(_TableBuilderAbstract):
    """
    Abstract builder of the info table of a DataFrame.

    Parameters
    ----------
    info : DataFrameInfo.
        Instance of DataFrameInfo.
    """

    def __init__(self, *, info: DataFrameInfo) -> None:
        self.info: DataFrameInfo = info

    def get_lines(self) -> list[str]:
        """Build the table from scratch and return its lines."""
        self._lines = []
        fill = self._fill_empty_info if self.col_count == 0 else self._fill_non_empty_info
        fill()
        return self._lines

    def _fill_empty_info(self) -> None:
        """Add lines to the info table, pertaining to empty dataframe."""
        self.add_object_type_line()
        self.add_index_range_line()
        type_name = type(self.data).__name__
        self._lines.append(f"Empty {type_name}\n")

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""

    @property
    def data(self) -> DataFrame:
        """DataFrame."""
        return self.info.data

    @property
    def ids(self) -> Index:
        """Dataframe columns."""
        return self.info.ids

    @property
    def col_count(self) -> int:
        """Number of dataframe columns to be summarized."""
        return self.info.col_count

    def add_memory_usage_line(self) -> None:
        """Add line containing memory usage."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")
class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder):
    """
    Builder of the short (non-verbose) DataFrame info table.
    """

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""
        for add_line in (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_columns_summary_line,
            self.add_dtypes_line,
        ):
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    def add_columns_summary_line(self) -> None:
        """Append the summary of the columns index, labelled "Columns"."""
        summary = self.ids._summary(name="Columns")
        self._lines.append(summary)
class _TableBuilderVerboseMixin(_TableBuilderAbstract):
    """
    Shared pieces of the verbose info tables.
    """

    SPACING: str = " " * 2
    strrows: Sequence[Sequence[str]]
    gross_column_widths: Sequence[int]
    with_counts: bool

    @property
    @abstractmethod
    def headers(self) -> Sequence[str]:
        """Titles of the columns of the verbose table."""

    @property
    def header_column_widths(self) -> Sequence[int]:
        """Widths of header columns (only titles)."""
        return [len(title) for title in self.headers]

    def _get_gross_column_widths(self) -> Sequence[int]:
        """Per column, the larger of the header width and the body width."""
        body_widths = self._get_body_column_widths()
        return [
            max(header_width, body_width)
            for header_width, body_width in zip(
                self.header_column_widths, body_widths, strict=False
            )
        ]

    def _get_body_column_widths(self) -> Sequence[int]:
        """Width of the longest entry of every body column."""
        columns: Sequence[Sequence[str]] = list(zip(*self.strrows, strict=True))
        return [max(map(len, column)) for column in columns]

    def _gen_rows(self) -> Iterator[Sequence[str]]:
        """
        Return the iterator over the body rows.

        Each element represents a row comprising a sequence of strings.
        """
        if not self.with_counts:
            return self._gen_rows_without_counts()
        return self._gen_rows_with_counts()

    @abstractmethod
    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""

    @abstractmethod
    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""

    def _render_row(self, cells: Sequence[str]) -> str:
        """Fit every cell to its gross column width and join with SPACING."""
        return self.SPACING.join(
            [
                _put_str(cell, width)
                for cell, width in zip(cells, self.gross_column_widths, strict=True)
            ]
        )

    def add_header_line(self) -> None:
        """Append the line with the column titles."""
        self._lines.append(self._render_row(self.headers))

    def add_separator_line(self) -> None:
        """Append the line of dashes; each run is as long as its title."""
        dashes = ["-" * title_width for title_width in self.header_column_widths]
        self._lines.append(self._render_row(dashes))

    def add_body_lines(self) -> None:
        """Append one line per body row."""
        for cells in self.strrows:
            self._lines.append(self._render_row(cells))

    def _gen_non_null_counts(self) -> Iterator[str]:
        """Iterator with string representation of non-null counts."""
        yield from (f"{count} non-null" for count in self.non_null_counts)

    def _gen_dtypes(self) -> Iterator[str]:
        """Iterator with string representation of column dtypes."""
        yield from (pprint_thing(dtype) for dtype in self.dtypes)
class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin):
    """
    Builder of the verbose DataFrame info table.
    """

    def __init__(
        self,
        *,
        info: DataFrameInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # Rows are materialised first: the column widths are derived from them.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""
        for add_line in (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_columns_summary_line,
            self.add_header_line,
            self.add_separator_line,
            self.add_body_lines,
            self.add_dtypes_line,
        ):
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if not self.with_counts:
            return [" # ", "Column", "Dtype"]
        return [" # ", "Column", "Non-Null Count", "Dtype"]

    def add_columns_summary_line(self) -> None:
        """Append the line stating the total number of columns."""
        total = self.col_count
        self._lines.append(f"Data columns (total {total} columns):")

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""
        line_numbers = self._gen_line_numbers()
        column_names = self._gen_columns()
        dtype_names = self._gen_dtypes()
        yield from zip(line_numbers, column_names, dtype_names, strict=True)

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""
        line_numbers = self._gen_line_numbers()
        column_names = self._gen_columns()
        counts = self._gen_non_null_counts()
        dtype_names = self._gen_dtypes()
        yield from zip(line_numbers, column_names, counts, dtype_names, strict=True)

    def _gen_line_numbers(self) -> Iterator[str]:
        """Iterator with string representation of column numbers."""
        yield from (f" {position}" for position, _ in enumerate(self.ids))

    def _gen_columns(self) -> Iterator[str]:
        """Iterator with string representation of column names."""
        yield from (pprint_thing(label) for label in self.ids)
class _SeriesTableBuilder(_TableBuilderAbstract):
    """
    Abstract builder of the info table of a Series.

    Parameters
    ----------
    info : SeriesInfo.
        Instance of SeriesInfo.
    """

    def __init__(self, *, info: SeriesInfo) -> None:
        self.info: SeriesInfo = info

    def get_lines(self) -> list[str]:
        """Build the table from scratch and return its lines."""
        lines: list[str] = []
        self._lines = lines
        self._fill_non_empty_info()
        return self._lines

    @property
    def data(self) -> Series:
        """Series."""
        return self.info.data

    def add_memory_usage_line(self) -> None:
        """Add line containing memory usage."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder):
    """
    Builder of the short (non-verbose) Series info table.
    """

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
        for add_line in (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_dtypes_line,
        ):
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()
class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin):
    """
    Builder of the verbose Series info table.
    """

    def __init__(
        self,
        *,
        info: SeriesInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # Rows are materialised first: the column widths are derived from them.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
        for add_line in (
            self.add_object_type_line,
            self.add_index_range_line,
            self.add_series_name_line,
            self.add_header_line,
            self.add_separator_line,
            self.add_body_lines,
            self.add_dtypes_line,
        ):
            add_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    def add_series_name_line(self) -> None:
        """Append the line stating the name of the series."""
        series_name = self.data.name
        self._lines.append(f"Series name: {series_name}")

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if not self.with_counts:
            return ["Dtype"]
        return ["Non-Null Count", "Dtype"]

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""
        for dtype_name in self._gen_dtypes():
            yield [dtype_name]

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""
        counts = self._gen_non_null_counts()
        dtype_names = self._gen_dtypes()
        yield from zip(counts, dtype_names, strict=True)
def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
"""
Create mapping between datatypes and their number of occurrences.
"""
# groupby dtype.name to collect e.g. Categorical columns
return df.dtypes.value_counts().groupby(lambda x: x.name).sum()

View File

@ -0,0 +1,587 @@
"""
Printing tools.
"""
from __future__ import annotations
from collections.abc import (
Callable,
Iterable,
Mapping,
Sequence,
)
import sys
from typing import (
TYPE_CHECKING,
Any,
TypeAlias,
TypeVar,
)
from unicodedata import east_asian_width
from pandas._config import get_option
from pandas.core.dtypes.inference import is_sequence
from pandas.io.formats.console import get_console_size
if TYPE_CHECKING:
from pandas._typing import ListLike
EscapeChars: TypeAlias = Mapping[str, str] | Iterable[str]
_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
def adjoin(space: int, *lists: list[str], **kwargs: Any) -> str:
"""
Glues together two sets of strings using the amount of space requested.
The idea is to prettify.
----------
space : int
number of spaces for padding
lists : str
list of str which being joined
strlen : callable
function used to calculate the length of each str. Needed for unicode
handling.
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
strlen = kwargs.pop("strlen", len)
justfunc = kwargs.pop("justfunc", _adj_justify)
newLists = []
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
# not the last one
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = justfunc(lst, lengths[i], mode="left")
nl = ([" " * lengths[i]] * (maxLen - len(lst))) + nl
newLists.append(nl)
toJoin = zip(*newLists, strict=True)
return "\n".join("".join(lines) for lines in toJoin)
def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]:
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == "left":
return [x.ljust(max_len) for x in texts]
elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or
# bytes(3.x)/str(2.x) representations of objects.
# Try to use these as much as possible rather than rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
# It will always return unicode text which can be handled by other
# parts of the package without breakage.
#
# 2) if you need to write something out to file, use
# pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
# working with straight ascii.
def _pprint_seq(
seq: ListLike, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any
) -> str:
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather than calling this directly.
bounds length of printed sequence, depending on options
"""
if isinstance(seq, set):
fmt = "{{{body}}}"
elif isinstance(seq, frozenset):
fmt = "frozenset({{{body}}})"
else:
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
if max_seq_items is False:
max_items = None
else:
max_items = max_seq_items or get_option("max_seq_items") or len(seq)
s = iter(seq)
# handle sets, no slicing
r = []
max_items_reached = False
for i, item in enumerate(s):
if (max_items is not None) and (i >= max_items):
max_items_reached = True
break
r.append(pprint_thing(item, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))
body = ", ".join(r)
if max_items_reached:
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
body += ","
return fmt.format(body=body)
def _pprint_dict(
seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds: Any
) -> str:
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather than calling this directly.
"""
fmt = "{{{things}}}"
pairs = []
pfmt = "{key}: {val}"
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
)
)
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
else:
return fmt.format(things=", ".join(pairs))
def pprint_thing(
    thing: object,
    _nest_lvl: int = 0,
    escape_chars: EscapeChars | None = None,
    default_escapes: bool = False,
    quote_strings: bool = False,
    max_seq_items: int | None = None,
) -> str:
    """
    Convert an object to its string representation, handling nested sequences.

    This function is the sanctioned way of converting objects to a string
    representation.

    Parameters
    ----------
    thing : anything to be formatted
    _nest_lvl : internal use only. pprint_thing() is mutually-recursive
        with the sequence/dict printers, this argument is used to keep track
        of the current nesting level, and limit it.
    escape_chars : list[str] or Mapping[str, str], optional
        Characters to escape. If a Mapping is passed the values are the
        replacements
    default_escapes : bool, default False
        Whether the input escape characters replaces or adds to the defaults
    quote_strings : bool, default False
        Whether a ``str`` is wrapped in single quotes
    max_seq_items : int or None, default None
        Pass through to other pretty printers to limit sequence printing

    Returns
    -------
    str
    """

    def as_escaped_string(
        thing: Any, escape_chars: EscapeChars | None = escape_chars
    ) -> str:
        replacements = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"}
        if isinstance(escape_chars, Mapping):
            if default_escapes:
                replacements.update(escape_chars)
            else:
                replacements = escape_chars  # type: ignore[assignment]
            to_escape = list(escape_chars.keys())
        else:
            to_escape = escape_chars or ()

        text = str(thing)
        for char in to_escape:
            text = text.replace(char, replacements[char])
        return text

    # Iterators are never consumed; fall back to their plain ``str``.
    if hasattr(thing, "__next__"):
        return str(thing)

    if isinstance(thing, Mapping) and _nest_lvl < get_option(
        "display.pprint_nest_depth"
    ):
        return _pprint_dict(
            thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
        )

    if is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
        return _pprint_seq(
            # mypy: ``thing`` is typed ``object`` but ``_pprint_seq`` expects
            # a list-like.
            thing,  # type: ignore[arg-type]
            _nest_lvl,
            escape_chars=escape_chars,
            quote_strings=quote_strings,
            max_seq_items=max_seq_items,
        )

    if isinstance(thing, str) and quote_strings:
        return f"'{as_escaped_string(thing)}'"

    return as_escaped_string(thing)
def pprint_thing_encoded(
    object: object, encoding: str = "utf-8", errors: str = "replace"
) -> bytes:
    """
    Pretty-print ``object`` with ``pprint_thing`` and encode the text to bytes.

    ``encoding`` and ``errors`` are handed to ``str.encode`` unchanged.
    """
    return pprint_thing(object).encode(encoding, errors)
def enable_data_resource_formatter(enable: bool) -> None:
    """
    Switch the ``application/vnd.dataresource+json`` IPython formatter on or off.

    Does nothing unless IPython has already been imported and an interactive
    shell is active. When enabling, the formatter is registered first if the
    mime-type has no formatter yet.
    """
    if "IPython" not in sys.modules:
        # definitely not in IPython
        return

    from IPython import get_ipython

    # error: Call to untyped function "get_ipython" in typed context
    shell = get_ipython()  # type: ignore[no-untyped-call]
    if shell is None:
        # still not in IPython
        return

    formatters = shell.display_formatter.formatters
    mimetype = "application/vnd.dataresource+json"
    registered = mimetype in formatters

    if not enable:
        # Disable the formatter, but only if one has ever been registered.
        if registered:
            formatters[mimetype].enabled = False
        return

    if not registered:
        # define tableschema formatter
        from IPython.core.formatters import BaseFormatter
        from traitlets import ObjectName

        class TableSchemaFormatter(BaseFormatter):
            print_method = ObjectName("_repr_data_resource_")
            _return_type = (dict,)

        # register it:
        formatters[mimetype] = TableSchemaFormatter()

    # enable it if it's been disabled:
    formatters[mimetype].enabled = True
def default_pprint(thing: Any, max_seq_items: int | None = None) -> str:
    """
    Pretty-print ``thing`` with tab, carriage-return and newline escaped and
    strings quoted.
    """
    options = {
        "escape_chars": ("\t", "\r", "\n"),
        "quote_strings": True,
        "max_seq_items": max_seq_items,
    }
    return pprint_thing(thing, **options)
def format_object_summary(
    obj: ListLike,
    formatter: Callable,
    is_justify: bool = True,
    name: str | None = None,
    indent_for_name: bool = True,
    line_break_each_value: bool = False,
) -> str:
    """
    Return the formatted obj as a unicode string

    The values are rendered between ``[`` and ``]``, wrapped onto several
    lines when they do not fit the display width, and truncated to a head and
    a tail separated by ``...`` when ``obj`` has more items than the
    ``display.max_seq_items`` option allows.

    Parameters
    ----------
    obj : object
        must be iterable and support __getitem__
    formatter : callable
        string formatter for an element
    is_justify : bool
        should justify the display
    name : str, optional
        defaults to the class name of the obj
    indent_for_name : bool, default True
        Whether subsequent lines should be indented to
        align with the name.
    line_break_each_value : bool, default False
        If True, inserts a line break for each value of ``obj``.
        If False, only break lines when a line of values gets wider
        than the display width.

    Returns
    -------
    summary string
    """
    # Width to wrap at: the console width if known, otherwise the
    # ``display.width`` option, otherwise 80.
    display_width, _ = get_console_size()
    if display_width is None:
        display_width = get_option("display.width") or 80
    if name is None:
        name = type(obj).__name__

    # space1 / space2 are the prefixes (newline + indent) used for the
    # closing line and for continuation lines respectively.
    if indent_for_name:
        name_len = len(name)
        space1 = f"\n{(' ' * (name_len + 1))}"
        space2 = f"\n{(' ' * (name_len + 2))}"
    else:
        space1 = "\n"
        space2 = "\n "  # space for the opening '['

    n = len(obj)
    if line_break_each_value:
        # If we want to vertically align on each value of obj, we need to
        # separate values by a line break and indent the values
        sep = ",\n " + " " * len(name)
    else:
        sep = ","
    # An unset (falsy) option means "no limit", i.e. show all n items.
    max_seq_items = get_option("display.max_seq_items") or n

    # are we a truncated display
    is_truncated = n > max_seq_items

    # adj can optionally handle unicode eastern asian width
    adj = get_adjustment()

    def _extend_line(
        s: str, line: str, value: str, display_width: int, next_line_prefix: str
    ) -> tuple[str, str]:
        # Append ``value`` to the current ``line``; if that would reach the
        # display width, first flush ``line`` into ``s`` and start a new line.
        if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
            s += line.rstrip()
            line = next_line_prefix
        line += value
        return s, line

    def best_len(values: list[str]) -> int:
        # Widest display length among ``values`` (0 for an empty list).
        if values:
            return max(adj.len(x) for x in values)
        else:
            return 0

    close = ", "

    # Short inputs are rendered directly on one line.
    if n == 0:
        summary = f"[]{close}"
    elif n == 1 and not line_break_each_value:
        first = formatter(obj[0])
        summary = f"[{first}]{close}"
    elif n == 2 and not line_break_each_value:
        first = formatter(obj[0])
        last = formatter(obj[-1])
        summary = f"[{first}, {last}]{close}"
    else:
        if max_seq_items == 1:
            # If max_seq_items=1 show only last element
            head = []
            tail = [formatter(x) for x in obj[-1:]]
        elif n > max_seq_items:
            # Truncated: ``n`` is reused as the number of items shown at each
            # end (at most 10).
            n = min(max_seq_items // 2, 10)
            head = [formatter(x) for x in obj[:n]]
            tail = [formatter(x) for x in obj[-n:]]
        else:
            head = []
            tail = [formatter(x) for x in obj]

        # adjust all values to max length if needed
        if is_justify:
            if line_break_each_value:
                # Justify each string in the values of head and tail, so the
                # strings will right align when head and tail are stacked
                # vertically.
                head, tail = _justify(head, tail)
            elif is_truncated or not (
                len(", ".join(head)) < display_width
                and len(", ".join(tail)) < display_width
            ):
                # Each string in head and tail should align with each other
                max_length = max(best_len(head), best_len(tail))
                head = [x.rjust(max_length) for x in head]
                tail = [x.rjust(max_length) for x in tail]
            # If we are not truncated and we are only a single
            # line, then don't justify

        if line_break_each_value:
            # Now head and tail are of type List[Tuple[str]]. Below we
            # convert them into List[str], so there will be one string per
            # value. Also truncate items horizontally if wider than
            # max_space
            max_space = display_width - len(space2)
            value = tail[0]
            max_items = 1
            # Find the largest item count for which the first tail value
            # still fits, trying the widest rendering first.
            for num_items in reversed(range(1, len(value) + 1)):
                pprinted_seq = _pprint_seq(value, max_seq_items=num_items)
                if len(pprinted_seq) < max_space:
                    max_items = num_items
                    break
            head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
            tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]

        # ``summary`` collects the finished lines, ``line`` is the line
        # currently being filled.
        summary = ""
        line = space2

        for head_value in head:
            word = head_value + sep + " "
            summary, line = _extend_line(summary, line, word, display_width, space2)

        if is_truncated:
            # remove trailing space of last line
            summary += line.rstrip() + space2 + "..."
            line = space2

        for tail_item in tail[:-1]:
            word = tail_item + sep + " "
            summary, line = _extend_line(summary, line, word, display_width, space2)

        # last value: no sep added + 1 space of width used for trailing ','
        summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
        summary += line

        # right now close is either '' or ', '
        # Now we want to include the ']', but not the maybe space.
        close = "]" + close.rstrip(" ")
        summary += close

        if len(summary) > (display_width) or line_break_each_value:
            summary += space1
        else:  # one row
            summary += " "

        # remove initial space
        summary = "[" + summary[len(space2) :]

    return summary
def _justify(
head: list[Sequence[str]], tail: list[Sequence[str]]
) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]:
"""
Justify items in head and tail, so they are right-aligned when stacked.
Parameters
----------
head : list-like of list-likes of strings
tail : list-like of list-likes of strings
Returns
-------
tuple of list of tuples of strings
Same as head and tail, but items are right aligned when stacked
vertically.
Examples
--------
>>> _justify([["a", "b"]], [["abc", "abcd"]])
([(' a', ' b')], [('abc', 'abcd')])
"""
combined = head + tail
# For each position for the sequences in ``combined``,
# find the length of the largest string.
max_length = [0] * len(combined[0])
for inner_seq in combined:
length = [len(item) for item in inner_seq]
max_length = [max(x, y) for x, y in zip(max_length, length, strict=True)]
# justify each item in each list-like in head and tail using max_length
head_tuples = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length, strict=True))
for seq in head
]
tail_tuples = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length, strict=True))
for seq in tail
]
return head_tuples, tail_tuples
class PrettyDict(dict[_KT, _VT]):
    """
    Dict extension to support abbreviated __repr__

    ``repr`` is delegated to ``pprint_thing``, which is given the dict itself.
    """

    def __repr__(self) -> str:
        return pprint_thing(self)
class _TextAdjustment:
def __init__(self) -> None:
self.encoding = get_option("display.encoding")
def len(self, text: str) -> int:
return len(text)
def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == "left":
return [x.ljust(max_len) for x in texts]
elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
def adjoin(self, space: int, *lists: Any, **kwargs: Any) -> str:
return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs)
class _EastAsianTextAdjustment(_TextAdjustment):
def __init__(self) -> None:
super().__init__()
if get_option("display.unicode.ambiguous_as_wide"):
self.ambiguous_width = 2
else:
self.ambiguous_width = 1
# Definition of East Asian Width
# https://unicode.org/reports/tr11/
# Ambiguous width can be changed by option
self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
def len(self, text: str) -> int:
"""
Calculate display width considering unicode East Asian Width
"""
if not isinstance(text, str):
return len(text)
return sum(
self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
)
def justify(
self, texts: Iterable[str], max_len: int, mode: str = "right"
) -> list[str]:
# re-calculate padding space per str considering East Asian Width
def _get_pad(t: str) -> int:
return max_len - self.len(t) + len(t)
if mode == "left":
return [x.ljust(_get_pad(x)) for x in texts]
elif mode == "center":
return [x.center(_get_pad(x)) for x in texts]
else:
return [x.rjust(_get_pad(x)) for x in texts]
def get_adjustment() -> _TextAdjustment:
use_east_asian_width = get_option("display.unicode.east_asian_width")
if use_east_asian_width:
return _EastAsianTextAdjustment()
else:
return _TextAdjustment()

View File

@ -0,0 +1,207 @@
"""
Module for formatting output data in console (to string).
"""
from __future__ import annotations
from shutil import get_terminal_size
from typing import TYPE_CHECKING
import numpy as np
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import Iterable
from pandas.io.formats.format import DataFrameFormatter
class StringFormatter:
"""Formatter for string representation of a dataframe."""
def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None:
self.fmt = fmt
self.adj = fmt.adj
self.frame = fmt.frame
self.line_width = line_width
def to_string(self) -> str:
text = self._get_string_representation()
if self.fmt.should_show_dimensions:
text = f"{text}{self.fmt.dimensions_info}"
return text
def _get_strcols(self) -> list[list[str]]:
strcols = self.fmt.get_strcols()
if self.fmt.is_truncated:
strcols = self._insert_dot_separators(strcols)
return strcols
def _get_string_representation(self) -> str:
if self.fmt.frame.empty:
return self._empty_info_line
strcols = self._get_strcols()
if self.line_width is None:
# no need to wrap around just print the whole frame
return self.adj.adjoin(1, *strcols)
if self._need_to_wrap_around:
return self._join_multiline(strcols)
return self._fit_strcols_to_terminal_width(strcols)
@property
def _empty_info_line(self) -> str:
return (
f"Empty {type(self.frame).__name__}\n"
f"Columns: {pprint_thing(self.frame.columns)}\n"
f"Index: {pprint_thing(self.frame.index)}"
)
@property
def _need_to_wrap_around(self) -> bool:
return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0)
def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]:
str_index = self.fmt._get_formatted_index(self.fmt.tr_frame)
index_length = len(str_index)
if self.fmt.is_truncated_horizontally:
strcols = self._insert_dot_separator_horizontal(strcols, index_length)
if self.fmt.is_truncated_vertically:
strcols = self._insert_dot_separator_vertical(strcols, index_length)
return strcols
@property
def _adjusted_tr_col_num(self) -> int:
return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num
def _insert_dot_separator_horizontal(
self, strcols: list[list[str]], index_length: int
) -> list[list[str]]:
strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length)
return strcols
def _insert_dot_separator_vertical(
self, strcols: list[list[str]], index_length: int
) -> list[list[str]]:
n_header_rows = index_length - len(self.fmt.tr_frame)
row_num = self.fmt.tr_row_num
for ix, col in enumerate(strcols):
cwidth = self.adj.len(col[row_num])
if self.fmt.is_truncated_horizontally:
is_dot_col = ix == self._adjusted_tr_col_num
else:
is_dot_col = False
if cwidth > 3 or is_dot_col:
dots = "..."
else:
dots = ".."
if ix == 0 and self.fmt.index:
dot_mode = "left"
elif is_dot_col:
cwidth = 4
dot_mode = "right"
else:
dot_mode = "right"
dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0]
col.insert(row_num + n_header_rows, dot_str)
return strcols
def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
lwidth = self.line_width
adjoin_width = 1
strcols = list(strcols_input)
if self.fmt.index:
idx = strcols.pop(0)
lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
col_widths = [
np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
for col in strcols
]
assert lwidth is not None
col_bins = _binify(col_widths, lwidth)
nbins = len(col_bins)
str_lst = []
start = 0
for i, end in enumerate(col_bins):
row = strcols[start:end]
if self.fmt.index:
row.insert(0, idx)
if nbins > 1:
nrows = len(row[-1])
if end <= len(strcols) and i < nbins - 1:
row.append([" \\"] + [" "] * (nrows - 1))
else:
row.append([" "] * nrows)
str_lst.append(self.adj.adjoin(adjoin_width, *row))
start = end
return "\n\n".join(str_lst)
def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
from pandas import Series
lines = self.adj.adjoin(1, *strcols).split("\n")
max_len = Series(lines).str.len().max()
# plus truncate dot col
width, _ = get_terminal_size()
dif = max_len - width
# '+ 1' to avoid too wide repr (GH PR #17023)
adj_dif = dif + 1
col_lens = Series([Series(ele).str.len().max() for ele in strcols])
n_cols = len(col_lens)
counter = 0
while adj_dif > 0 and n_cols > 1:
counter += 1
mid = round(n_cols / 2)
mid_ix = col_lens.index[mid]
col_len = col_lens[mid_ix]
# adjoin adds one
adj_dif -= col_len + 1
col_lens = col_lens.drop(mid_ix)
n_cols = len(col_lens)
# subtract index column
max_cols_fitted = n_cols - self.fmt.index
# GH-21180. Ensure that we print at least two.
max_cols_fitted = max(max_cols_fitted, 2)
self.fmt.max_cols_fitted = max_cols_fitted
# Call again _truncate to cut frame appropriately
# and then generate string representation
self.fmt.truncate()
strcols = self._get_strcols()
return self.adj.adjoin(1, *strcols)
def _binify(cols: list[int], line_width: int) -> list[int]:
adjoin_width = 1
bins = []
curr_width = 0
i_last_column = len(cols) - 1
for i, w in enumerate(cols):
w_adjoined = w + adjoin_width
curr_width += w_adjoined
if i_last_column == i:
wrap = curr_width + 1 > line_width and i > 0
else:
wrap = curr_width + 2 > line_width and i > 0
if wrap:
bins.append(i)
curr_width = w_adjoined
bins.append(len(cols))
return bins

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,16 @@
{# Update the html_style/table_structure.html documentation too #}
{% if doctype_html %}
<!DOCTYPE html>
<html>
<head>
<meta charset="{{encoding}}">
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
</head>
<body>
{% include html_table_tpl %}
</body>
</html>
{% elif not doctype_html %}
{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
{% include html_table_tpl %}
{% endif %}

View File

@ -0,0 +1,26 @@
{%- block before_style -%}{%- endblock before_style -%}
{% block style %}
<style type="text/css">
{% block table_styles %}
{% for s in table_styles %}
#T_{{uuid}} {{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{% endfor %}
{% endblock table_styles %}
{% block before_cellstyle %}{% endblock before_cellstyle %}
{% block cellstyle %}
{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %}
{% for s in cs %}
{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}_{{selector}}{% endfor %} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{% endfor %}
{% endfor %}
{% endblock cellstyle %}
</style>
{% endblock style %}

View File

@ -0,0 +1,63 @@
{% block before_table %}{% endblock before_table %}
{% block table %}
{% if exclude_styles %}
<table>
{% else %}
<table id="T_{{uuid}}"{% if table_attributes %} {{table_attributes}}{% endif %}>
{% endif %}
{% block caption %}
{% if caption and caption is string %}
<caption>{{caption}}</caption>
{% elif caption and caption is sequence %}
<caption>{{caption[0]}}</caption>
{% endif %}
{% endblock caption %}
{% block thead %}
<thead>
{% block before_head_rows %}{% endblock %}
{% for r in head %}
{% block head_tr scoped %}
<tr>
{% if exclude_styles %}
{% for c in r %}
{% if c.is_visible != False %}
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}
{% endfor %}
{% else %}
{% for c in r %}
{% if c.is_visible != False %}
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}
{% endfor %}
{% endif %}
</tr>
{% endblock head_tr %}
{% endfor %}
{% block after_head_rows %}{% endblock %}
</thead>
{% endblock thead %}
{% block tbody %}
<tbody>
{% block before_rows %}{% endblock before_rows %}
{% for r in body %}
{% block tr scoped %}
<tr>
{% if exclude_styles %}
{% for c in r %}{% if c.is_visible != False %}
<{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}{% endfor %}
{% else %}
{% for c in r %}{% if c.is_visible != False %}
<{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
{% endif %}{% endfor %}
{% endif %}
</tr>
{% endblock tr %}
{% endfor %}
{% block after_rows %}{% endblock after_rows %}
</tbody>
{% endblock tbody %}
</table>
{% endblock table %}
{% block after_table %}{% endblock after_table %}

View File

@ -0,0 +1,5 @@
{% if environment == "longtable" %}
{% include "latex_longtable.tpl" %}
{% else %}
{% include "latex_table.tpl" %}
{% endif %}

View File

@ -0,0 +1,82 @@
\begin{longtable}
{%- set position = parse_table(table_styles, 'position') %}
{%- if position is not none %}
[{{position}}]
{%- endif %}
{%- set column_format = parse_table(table_styles, 'column_format') %}
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
{% for style in table_styles %}
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format', 'label'] %}
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
{% endif %}
{% endfor %}
{% if caption and caption is string %}
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}}
{%- endif %} \\
{% elif caption and caption is sequence %}
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}}
{%- endif %} \\
{% else %}
{%- set label = parse_table(table_styles, 'label') %}
{%- if label is not none %}
\label{{label}} \\
{% endif %}
{% endif %}
{% set toprule = parse_table(table_styles, 'toprule') %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
{% endfor %}
{% set midrule = parse_table(table_styles, 'midrule') %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endfirsthead
{% if caption and caption is string %}
\caption[]{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %} \\
{% elif caption and caption is sequence %}
\caption[]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %} \\
{% endif %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
{% endfor %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endhead
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\multicolumn{% raw %}{{% endraw %}{{body[0]|length}}{% raw %}}{% endraw %}{r}{Continued on next page} \\
{% if midrule is not none %}
\{{midrule}}
{% endif %}
\endfoot
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
{% if bottomrule is not none %}
\{{bottomrule}}
{% endif %}
\endlastfoot
{% for row in body %}
{% for c in row %}{% if not loop.first %} & {% endif %}
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
{%- endfor %} \\
{% if clines and clines[loop.index] | length > 0 %}
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
{% endif %}
{% endfor %}
\end{longtable}
{% raw %}{% endraw %}

View File

@ -0,0 +1,57 @@
{% if environment or parse_wrap(table_styles, caption) %}
\begin{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
{%- set position = parse_table(table_styles, 'position') %}
{%- if position is not none %}
[{{position}}]
{%- endif %}
{% set position_float = parse_table(table_styles, 'position_float') %}
{% if position_float is not none%}
\{{position_float}}
{% endif %}
{% if caption and caption is string %}
\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
{% elif caption and caption is sequence %}
\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
{% endif %}
{% for style in table_styles %}
{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format'] %}
\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
{% endif %}
{% endfor %}
{% endif %}
\begin{tabular}
{%- set column_format = parse_table(table_styles, 'column_format') %}
{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
{% set toprule = parse_table(table_styles, 'toprule') %}
{% if toprule is not none %}
\{{toprule}}
{% endif %}
{% for row in head %}
{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx, convert_css)}}{% endfor %} \\
{% endfor %}
{% set midrule = parse_table(table_styles, 'midrule') %}
{% if midrule is not none %}
\{{midrule}}
{% endif %}
{% for row in body %}
{% for c in row %}{% if not loop.first %} & {% endif %}
{%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align, False, convert_css)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
{%- endfor %} \\
{% if clines and clines[loop.index] | length > 0 %}
{%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
{% endif %}
{% endfor %}
{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
{% if bottomrule is not none %}
\{{bottomrule}}
{% endif %}
\end{tabular}
{% if environment or parse_wrap(table_styles, caption) %}
\end{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
{% endif %}

View File

@ -0,0 +1,12 @@
{% for r in head %}
{% for c in r %}{% if c["is_visible"] %}
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
{% endif %}{% endfor %}
{% endfor %}
{% for r in body %}
{% for c in r %}{% if c["is_visible"] %}
{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
{% endif %}{% endfor %}
{% endfor %}

View File

@ -0,0 +1,12 @@
#table(
columns: {{ head[0] | length }},
{% for r in head %}
{% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %}
{% endfor %}
{% for r in body %}
{% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %}
{% endfor %}
)

View File

@ -0,0 +1,566 @@
"""
:mod:`pandas.io.formats.xml` is a module for formatting data in XML.
"""
from __future__ import annotations
import codecs
import io
from typing import (
TYPE_CHECKING,
Any,
final,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.missing import isna
from pandas.io.common import get_handle
from pandas.io.xml import get_data_from_filepath
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
from pandas import DataFrame
class _BaseXMLFormatter:
"""
Base class for formatting data in XML.
Parameters
----------
path_or_buffer : str or file-like
This can be either a string of raw XML, a valid URL,
file or file-like object.
index : bool
Whether to include index in xml document.
root_name : str
Name for root of xml document. Default is 'data'.
row_name : str
Name for row elements of xml document. Default is 'row'.
na_rep : str
Missing data representation.
attr_cols : list
List of columns to write as attributes in row element.
elem_cols : list
List of columns to write as children in row element.
namespaces : dict
The namespaces to define in XML document as dicts with key
being namespace and value the URI.
prefix : str
The prefix for each element in XML document including root.
encoding : str
Encoding of xml object or document.
xml_declaration : bool
Whether to include xml declaration at top line item in xml.
pretty_print : bool
Whether to write xml document with line breaks and indentation.
stylesheet : str or file-like
A URL, file, file-like object, or a raw string containing XSLT.
compression : str or dict, default 'infer'
For on-the-fly compression of the output data. If 'infer' and 'path_or_buffer'
is path-like, then detect compression from the following extensions: '.gz',
'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
(otherwise no compression).
Set to ``None`` for no compression.
Can also be a dict with key ``'method'`` set
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
and other key-value pairs are forwarded to
``zipfile.ZipFile``, ``gzip.GzipFile``,
``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
``tarfile.TarFile``, respectively.
As an example, the following could be passed for faster compression and to
create a reproducible gzip archive:
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to ``urllib.request.Request`` as header options. For other
URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
details, and for more examples on storage options refer `here
<https://pandas.pydata.org/docs/user_guide/io.html?
highlight=storage_options#reading-writing-remote-files>`_.
See also
--------
pandas.io.formats.xml.EtreeXMLFormatter
pandas.io.formats.xml.LxmlXMLFormatter
"""
def __init__(
    self,
    frame: DataFrame,
    path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
    index: bool = True,
    root_name: str | None = "data",
    row_name: str | None = "row",
    na_rep: str | None = None,
    attr_cols: list[str] | None = None,
    elem_cols: list[str] | None = None,
    namespaces: dict[str | None, str] | None = None,
    prefix: str | None = None,
    encoding: str = "utf-8",
    xml_declaration: bool | None = True,
    pretty_print: bool | None = True,
    stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions | None = None,
) -> None:
    # Keep every option on the instance; the helper methods read them from
    # ``self`` rather than receiving them as arguments.
    self.frame = frame
    self.path_or_buffer = path_or_buffer
    self.index = index
    self.root_name = root_name
    self.row_name = row_name
    self.na_rep = na_rep
    self.attr_cols = attr_cols
    self.elem_cols = elem_cols
    self.namespaces = namespaces
    self.prefix = prefix
    self.encoding = encoding
    self.xml_declaration = xml_declaration
    self.pretty_print = pretty_print
    self.stylesheet = stylesheet
    self.compression: CompressionOptions = compression
    self.storage_options = storage_options

    # Snapshot the original column labels: _handle_indexes() compares the
    # keys of the processed rows against this list to find the index levels.
    self.orig_cols = self.frame.columns.tolist()
    # One dict per row, produced by _process_dataframe().
    self.frame_dicts = self._process_dataframe()

    # Validation happens eagerly so bad arguments fail at construction time.
    self._validate_columns()
    self._validate_encoding()
    # Resolved by the concrete subclass; prepended to every element name.
    self.prefix_uri = self._get_prefix_uri()
    # May prepend index labels to attr_cols / elem_cols (needs frame_dicts).
    self._handle_indexes()
def _build_tree(self) -> bytes:
    """
    Build tree from data.

    This method initializes the root and builds attributes and elements
    with optional namespaces.

    The base implementation only raises ``AbstractMethodError``; the
    parser-specific subclasses provide the actual implementation.
    ``write_output`` calls this method and treats the result as the
    encoded document.
    """
    raise AbstractMethodError(self)
@final
def _validate_columns(self) -> None:
"""
Validate elems_cols and attrs_cols.
This method will check if columns is list-like.
Raises
------
ValueError
* If value is not a list and less then length of nodes.
"""
if self.attr_cols and not is_list_like(self.attr_cols):
raise TypeError(
f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
)
if self.elem_cols and not is_list_like(self.elem_cols):
raise TypeError(
f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
)
@final
def _validate_encoding(self) -> None:
    """
    Validate encoding.

    This method checks that ``self.encoding`` names a codec known to the
    standard ``codecs`` registry. The lookup result itself is discarded;
    only the possible exception matters.

    Raises
    ------
    LookupError
        * If encoding is not available in codecs.
    """
    codecs.lookup(self.encoding)
@final
def _process_dataframe(self) -> dict[int | str, dict[str, Any]]:
"""
Adjust Data Frame to fit xml output.
This method will adjust underlying data frame for xml output,
including optionally replacing missing values and including indexes.
"""
df = self.frame
if self.index:
df = df.reset_index()
if self.na_rep is not None:
df = df.fillna(self.na_rep)
return df.to_dict(orient="index")
@final
def _handle_indexes(self) -> None:
"""
Handle indexes.
This method will add indexes into attr_cols or elem_cols.
"""
if not self.index:
return
first_key = next(iter(self.frame_dicts))
indexes: list[str] = [
x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
]
if self.attr_cols:
self.attr_cols = indexes + self.attr_cols
if self.elem_cols:
self.elem_cols = indexes + self.elem_cols
def _get_prefix_uri(self) -> str:
    """
    Get uri of namespace prefix.

    This method retrieves corresponding URI to prefix in namespaces.
    The base implementation only raises ``AbstractMethodError``; each
    parser-specific subclass supplies the lookup. ``__init__`` stores the
    result as ``self.prefix_uri``.

    Raises
    ------
    KeyError
        * If prefix is not included in namespace dict.
    """
    raise AbstractMethodError(self)
@final
def _other_namespaces(self) -> dict:
"""
Define other namespaces.
This method will build dictionary of namespaces attributes
for root element, conditionally with optional namespaces and
prefix.
"""
nmsp_dict: dict[str, str] = {}
if self.namespaces:
nmsp_dict = {
f"xmlns{p if p == '' else f':{p}'}": n
for p, n in self.namespaces.items()
if n != self.prefix_uri[1:-1]
}
return nmsp_dict
@final
def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
"""
Create attributes of row.
This method adds attributes using attr_cols to row element and
works with tuples for multindex or hierarchical columns.
"""
if not self.attr_cols:
return elem_row
for col in self.attr_cols:
attr_name = self._get_flat_col_name(col)
try:
if not isna(d[col]):
elem_row.attrib[attr_name] = str(d[col])
except KeyError as err:
raise KeyError(f"no valid column, {col}") from err
return elem_row
@final
def _get_flat_col_name(self, col: str | tuple) -> str:
flat_col = col
if isinstance(col, tuple):
flat_col = (
"".join([str(c) for c in col]).strip()
if "" in col
else "_".join([str(c) for c in col]).strip()
)
return f"{self.prefix_uri}{flat_col}"
@cache_readonly
def _sub_element_cls(self):
    """
    Factory used by ``_build_elems`` to create a child of a row element.

    It is called as ``factory(parent_element, name)`` and the returned
    object gets its ``.text`` assigned. The base implementation only
    raises ``AbstractMethodError``; subclasses return their library's
    ``SubElement``.
    """
    raise AbstractMethodError(self)
@final
def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
"""
Create child elements of row.
This method adds child elements using elem_cols to row element and
works with tuples for multindex or hierarchical columns.
"""
sub_element_cls = self._sub_element_cls
if not self.elem_cols:
return
for col in self.elem_cols:
elem_name = self._get_flat_col_name(col)
try:
val = None if isna(d[col]) or d[col] == "" else str(d[col])
sub_element_cls(elem_row, elem_name).text = val
except KeyError as err:
raise KeyError(f"no valid column, {col}") from err
@final
def write_output(self) -> str | None:
    """
    Build the document and either write it out or return it.

    Returns
    -------
    str or None
        When ``path_or_buffer`` is None, the document decoded with
        ``self.encoding`` and stripped of trailing whitespace. Otherwise
        the bytes are written to ``path_or_buffer`` and None is returned.
    """
    document = self._build_tree()

    if self.path_or_buffer is None:
        return document.decode(self.encoding).rstrip()

    # The tree is already encoded, so the target is opened in binary mode.
    with get_handle(
        self.path_or_buffer,
        "wb",
        compression=self.compression,
        storage_options=self.storage_options,
        is_text=False,
    ) as handles:
        handles.handle.write(document)

    return None
class EtreeXMLFormatter(_BaseXMLFormatter):
"""
Class for formatting data in xml using Python standard library
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
"""
def _build_tree(self) -> bytes:
from xml.etree.ElementTree import (
Element,
SubElement,
tostring,
)
self.root = Element(
f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces()
)
for d in self.frame_dicts.values():
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
self.elem_cols = list(d.keys())
self._build_elems(d, elem_row)
else:
elem_row = self._build_attribs(d, elem_row)
self._build_elems(d, elem_row)
self.out_xml = tostring(
self.root,
method="xml",
encoding=self.encoding,
xml_declaration=self.xml_declaration,
)
if self.pretty_print:
self.out_xml = self._prettify_tree()
if self.stylesheet is not None:
raise ValueError(
"To use stylesheet, you need lxml installed and selected as parser."
)
return self.out_xml
def _get_prefix_uri(self) -> str:
from xml.etree.ElementTree import register_namespace
uri = ""
if self.namespaces:
for p, n in self.namespaces.items():
if isinstance(p, str) and isinstance(n, str):
register_namespace(p, n)
if self.prefix:
try:
uri = f"{{{self.namespaces[self.prefix]}}}"
except KeyError as err:
raise KeyError(
f"{self.prefix} is not included in namespaces"
) from err
elif "" in self.namespaces:
uri = f"{{{self.namespaces['']}}}"
else:
uri = ""
return uri
@cache_readonly
def _sub_element_cls(self):
from xml.etree.ElementTree import SubElement
return SubElement
def _prettify_tree(self) -> bytes:
"""
Output tree for pretty print format.
This method will pretty print xml with line breaks and indentation.
"""
from xml.dom.minidom import parseString
dom = parseString(self.out_xml)
return dom.toprettyxml(indent=" ", encoding=self.encoding)
class LxmlXMLFormatter(_BaseXMLFormatter):
"""
Class for formatting data in xml using Python standard library
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._convert_empty_str_key()
def _build_tree(self) -> bytes:
"""
Build tree from data.
This method initializes the root and builds attributes and elements
with optional namespaces.
"""
from lxml.etree import (
Element,
SubElement,
tostring,
)
self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
for d in self.frame_dicts.values():
elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
self.elem_cols = list(d.keys())
self._build_elems(d, elem_row)
else:
elem_row = self._build_attribs(d, elem_row)
self._build_elems(d, elem_row)
self.out_xml = tostring(
self.root,
pretty_print=self.pretty_print,
method="xml",
encoding=self.encoding,
xml_declaration=self.xml_declaration,
)
if self.stylesheet is not None:
self.out_xml = self._transform_doc()
return self.out_xml
def _convert_empty_str_key(self) -> None:
"""
Replace zero-length string in `namespaces`.
This method will replace '' with None to align to `lxml`
requirement that empty string prefixes are not allowed.
"""
if self.namespaces and "" in self.namespaces.keys():
self.namespaces[None] = self.namespaces.pop("", "default")
def _get_prefix_uri(self) -> str:
uri = ""
if self.namespaces:
if self.prefix:
try:
uri = f"{{{self.namespaces[self.prefix]}}}"
except KeyError as err:
raise KeyError(
f"{self.prefix} is not included in namespaces"
) from err
elif "" in self.namespaces:
uri = f"{{{self.namespaces['']}}}"
else:
uri = ""
return uri
@cache_readonly
def _sub_element_cls(self):
from lxml.etree import SubElement
return SubElement
def _transform_doc(self) -> bytes:
"""
Parse stylesheet from file or buffer and run it.
This method will parse stylesheet object into tree for parsing
conditionally by its specific object type, then transforms
original tree with XSLT script.
"""
from lxml.etree import (
XSLT,
XMLParser,
fromstring,
parse,
)
style_doc = self.stylesheet
assert style_doc is not None # is ensured by caller
handle_data = get_data_from_filepath(
filepath_or_buffer=style_doc,
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
)
with handle_data as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
if isinstance(xml_data, io.StringIO):
xsl_doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
else:
xsl_doc = parse(xml_data, parser=curr_parser)
transformer = XSLT(xsl_doc)
new_doc = transformer(self.root)
return bytes(new_doc)