#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Public testing utility functions.
"""
from typing import Literal, Union
import pyspark.pandas as ps
try:
from pyspark.sql.pandas.utils import require_minimum_pandas_version
require_minimum_pandas_version()
import pandas as pd
except ImportError:
pass
def assert_frame_equal(
    left: Union[ps.DataFrame, pd.DataFrame],
    right: Union[ps.DataFrame, pd.DataFrame],
    check_dtype: bool = True,
    check_index_type: Union[bool, Literal["equiv"]] = "equiv",
    check_column_type: Union[bool, Literal["equiv"]] = "equiv",
    check_frame_type: bool = True,
    check_names: bool = True,
    by_blocks: bool = False,
    check_exact: bool = False,
    check_datetimelike_compat: bool = False,
    check_categorical: bool = True,
    check_like: bool = False,
    check_freq: bool = True,
    check_flags: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "DataFrame",
) -> None:
    """
    Check that left and right DataFrame are equal.

    This function is intended to compare two DataFrames and output any
    differences. It is mostly intended for use in unit tests.
    Additional parameters allow varying the strictness of the
    equality checks performed.

    Any pandas-on-Spark DataFrame argument is first converted with
    ``to_pandas()``; the comparison itself is delegated to
    ``pandas.testing.assert_frame_equal`` with every option forwarded as-is.

    .. versionadded:: 4.0.0

    Parameters
    ----------
    left : pyspark.pandas.DataFrame or pandas.DataFrame
        First DataFrame to compare.
    right : pyspark.pandas.DataFrame or pandas.DataFrame
        Second DataFrame to compare.
    check_dtype : bool, default True
        Whether to check the DataFrame dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_column_type : bool or {'equiv'}, default 'equiv'
        Whether to check the columns class, dtype and inferred_type
        are identical. Is passed as the ``exact`` argument of
        :func:`assert_index_equal`.
    check_frame_type : bool, default True
        Whether to check the DataFrame class is identical.
    check_names : bool, default True
        Whether to check that the `names` attribute for both the `index`
        and `column` attributes of the DataFrame is identical.
    by_blocks : bool, default False
        Specify how to compare internal data. If False, compare by columns.
        If True, compare by blocks.
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_like : bool, default False
        If True, ignore the order of index & columns.
        Note: index labels must match their respective rows
        (same as in columns) - same labels must be with the same data.
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
    check_flags : bool, default True
        Whether to check the `flags` attribute.
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.
    obj : str, default 'DataFrame'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    Raises
    ------
    AssertionError
        If the two DataFrames differ under the selected checks (raised by the
        underlying pandas assertion; see the example below).

    See Also
    --------
    assert_series_equal : Equivalent method for asserting Series equality.
    DataFrame.equals : Check DataFrame equality.

    Examples
    --------
    This example shows comparing two DataFrames that are equal
    but with columns of differing dtypes.

    >>> from pyspark.pandas.testing import assert_frame_equal
    >>> df1 = ps.DataFrame({'a': [1, 2], 'b': [3, 4]})
    >>> df2 = ps.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

    df1 equals itself.

    >>> assert_frame_equal(df1, df1)

    df1 differs from df2 as column 'b' is of a different type.

    >>> assert_frame_equal(df1, df2)
    Traceback (most recent call last):
    ...
    AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different
    <BLANKLINE>
    Attribute "dtype" are different
    [left]:  int64
    [right]: float64

    Ignore differing dtypes in columns with check_dtype.

    >>> assert_frame_equal(df1, df2, check_dtype=False)
    """
    # Normalize both operands to pandas so the pandas assertion can compare
    # them; objects that are already pandas DataFrames pass through untouched.
    if isinstance(left, ps.DataFrame):
        left = left.to_pandas()
    if isinstance(right, ps.DataFrame):
        right = right.to_pandas()

    # Every option is forwarded by keyword so the wrapper stays a 1:1 mirror
    # of the pandas signature.
    pd.testing.assert_frame_equal(
        left,
        right,
        check_dtype=check_dtype,
        check_index_type=check_index_type,  # type: ignore[arg-type]
        check_column_type=check_column_type,  # type: ignore[arg-type]
        check_frame_type=check_frame_type,
        check_names=check_names,
        by_blocks=by_blocks,
        check_exact=check_exact,
        check_datetimelike_compat=check_datetimelike_compat,
        check_categorical=check_categorical,
        check_like=check_like,
        check_freq=check_freq,
        check_flags=check_flags,
        rtol=rtol,
        atol=atol,
        obj=obj,
    )
def assert_series_equal(
    left: Union[ps.Series, pd.Series],
    right: Union[ps.Series, pd.Series],
    check_dtype: bool = True,
    check_index_type: Union[bool, Literal["equiv"]] = "equiv",
    check_series_type: bool = True,
    check_names: bool = True,
    check_exact: bool = False,
    check_datetimelike_compat: bool = False,
    check_categorical: bool = True,
    check_category_order: bool = True,
    check_freq: bool = True,
    check_flags: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Series",
    *,
    check_index: bool = True,
    check_like: bool = False,
) -> None:
    """
    Check that left and right Series are equal.

    Any pandas-on-Spark Series argument is first converted with
    ``to_pandas()``; the comparison itself is delegated to
    ``pandas.testing.assert_series_equal`` with every option forwarded as-is.

    .. versionadded:: 4.0.0

    Parameters
    ----------
    left : pyspark.pandas.Series or pandas.Series
        First Series to compare.
    right : pyspark.pandas.Series or pandas.Series
        Second Series to compare.
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
        Whether to check the Series class is identical.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
    check_flags : bool, default True
        Whether to check the `flags` attribute.
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.
    obj : str, default 'Series'
        Specify object name being compared, internally used to show appropriate
        assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only values.
        Keyword-only.
    check_like : bool, default False
        If True, ignore the order of the index. Must be False if check_index is False.
        Note: same labels must be with the same data.
        Keyword-only.

    Examples
    --------
    >>> from pyspark.pandas import testing as tm
    >>> a = ps.Series([1, 2, 3, 4])
    >>> b = ps.Series([1, 2, 3, 4])
    >>> tm.assert_series_equal(a, b)
    """
    # Normalize both operands to pandas so the pandas assertion can compare
    # them; objects that are already pandas Series pass through untouched.
    if isinstance(left, ps.Series):
        left = left.to_pandas()
    if isinstance(right, ps.Series):
        right = right.to_pandas()

    # Every option is forwarded by keyword so the wrapper stays a 1:1 mirror
    # of the pandas signature.
    pd.testing.assert_series_equal(  # type: ignore[call-arg]
        left,
        right,
        check_dtype=check_dtype,
        check_index_type=check_index_type,  # type: ignore[arg-type]
        check_series_type=check_series_type,
        check_names=check_names,
        check_exact=check_exact,
        check_datetimelike_compat=check_datetimelike_compat,
        check_categorical=check_categorical,
        check_category_order=check_category_order,
        check_freq=check_freq,
        check_flags=check_flags,
        rtol=rtol,  # type: ignore[arg-type]
        atol=atol,  # type: ignore[arg-type]
        obj=obj,
        check_index=check_index,
        check_like=check_like,
    )
def assert_index_equal(
    left: Union[ps.Index, pd.Index],
    right: Union[ps.Index, pd.Index],
    exact: Union[bool, Literal["equiv"]] = "equiv",
    check_names: bool = True,
    check_exact: bool = True,
    check_categorical: bool = True,
    check_order: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Index",
) -> None:
    """
    Check that left and right Index are equal.

    Any pandas-on-Spark Index argument is first converted with
    ``to_pandas()``; the comparison itself is delegated to
    ``pandas.testing.assert_index_equal`` with every option forwarded as-is.

    .. versionadded:: 4.0.0

    Parameters
    ----------
    left : pyspark.pandas.Index or pandas.Index
        First Index to compare.
    right : pyspark.pandas.Index or pandas.Index
        Second Index to compare.
    exact : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical. If 'equiv', then RangeIndex can be substituted for
        Index with an int64 dtype as well.
    check_names : bool, default True
        Whether to check the names attribute.
    check_exact : bool, default True
        Whether to compare number exactly.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_order : bool, default True
        Whether to compare the order of index entries as well as their values.
        If True, both indexes must contain the same elements, in the same order.
        If False, both indexes must contain the same elements, but in any order.
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.
    obj : str, default 'Index'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    Examples
    --------
    >>> from pyspark.pandas import testing as tm
    >>> a = ps.Index([1, 2, 3])
    >>> b = ps.Index([1, 2, 3])
    >>> tm.assert_index_equal(a, b)
    """
    # Normalize both operands to pandas so the pandas assertion can compare
    # them; objects that are already pandas Index instances pass through
    # untouched.
    if isinstance(left, ps.Index):
        left = left.to_pandas()
    if isinstance(right, ps.Index):
        right = right.to_pandas()

    # Every option is forwarded by keyword so the wrapper stays a 1:1 mirror
    # of the pandas signature.
    pd.testing.assert_index_equal(  # type: ignore[call-arg]
        left,
        right,
        exact=exact,
        check_names=check_names,
        check_exact=check_exact,
        check_categorical=check_categorical,
        check_order=check_order,
        rtol=rtol,
        atol=atol,
        obj=obj,
    )