Source code for pyspark.ml.functions

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

import inspect
import uuid
from typing import Any, Callable, Iterator, List, Mapping, TYPE_CHECKING, Tuple, Union, Optional

import numpy as np

try:
    import pandas as pd
except ImportError:
    pass  # Let it throw a better error message later when the API is invoked.

from pyspark.sql.functions import pandas_udf
from pyspark.sql.column import Column
from pyspark.sql.types import (
    ArrayType,
    ByteType,
    DataType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    ShortType,
    StringType,
    StructType,
)
from pyspark.ml.util import try_remote_functions

if TYPE_CHECKING:
    from pyspark.sql._typing import UserDefinedFunctionLike

supported_scalar_types = (
    ByteType,
    ShortType,
    IntegerType,
    LongType,
    FloatType,
    DoubleType,
    StringType,
)
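# Note (editorial comment, not part of the original module): these are the scalar Spark SQL
# types accepted for prediction results; e.g. isinstance(FloatType(), supported_scalar_types)
# holds during the result validation performed further below.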

# Callable type for end user predict functions that take a variable number of ndarrays as
# input and return one of the following as output:
# - single ndarray (single output)
# - dictionary of named ndarrays (multiple outputs represented in columnar form)
# - list of dictionaries of named ndarrays (multiple outputs represented in row form)
PredictBatchFunction = Callable[
    [np.ndarray], Union[np.ndarray, Mapping[str, np.ndarray], List[Mapping[str, np.dtype]]]
]
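
# A minimal illustrative sketch (not part of this module): a predict function conforming to
# PredictBatchFunction for a hypothetical two-input model could look like
#
#   def predict(x1: np.ndarray, x2: np.ndarray) -> Mapping[str, np.ndarray]:
#       # each argument receives one batch of inputs; the dict keys become output columns
#       return {"out1": x1 * 2.0, "out2": x2 + 1.0}
#
# which returns "multiple outputs represented in columnar form" per the comment above.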


@try_remote_functions
def vector_to_array(col: Column, dtype: str = "float64") -> Column:
    """
    Converts a column of MLlib sparse/dense vectors into a column of dense arrays.

    .. versionadded:: 3.0.0

    .. versionchanged:: 3.5.0
        Supports Spark Connect.

    Parameters
    ----------
    col : :py:class:`pyspark.sql.Column` or str
        Input column
    dtype : str, optional
        The data type of the output array. Valid values: "float64" or "float32".

    Returns
    -------
    :py:class:`pyspark.sql.Column`
        The converted column of dense arrays.

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.functions import vector_to_array
    >>> from pyspark.mllib.linalg import Vectors as OldVectors
    >>> df = spark.createDataFrame([
    ...     (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
    ...     (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),
    ...      OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],
    ...     ["vec", "oldVec"])
    >>> df1 = df.select(vector_to_array("vec").alias("vec"),
    ...                 vector_to_array("oldVec").alias("oldVec"))
    >>> df1.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df2 = df.select(vector_to_array("vec", "float32").alias("vec"),
    ...                 vector_to_array("oldVec", "float32").alias("oldVec"))
    >>> df2.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df1.schema.fields
    [StructField('vec', ArrayType(DoubleType(), False), False),
     StructField('oldVec', ArrayType(DoubleType(), False), False)]
    >>> df2.schema.fields
    [StructField('vec', ArrayType(FloatType(), False), False),
     StructField('oldVec', ArrayType(FloatType(), False), False)]
    """
    from pyspark.core.context import SparkContext
    from pyspark.sql.classic.column import Column, _to_java_column

    sc = SparkContext._active_spark_context
    assert sc is not None and sc._jvm is not None
    return Column(
        sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype)
    )

@try_remote_functions
def array_to_vector(col: Column) -> Column:
    """
    Converts a column of arrays of numeric type into a column of
    :py:class:`pyspark.ml.linalg.DenseVector` instances.

    .. versionadded:: 3.1.0

    .. versionchanged:: 3.5.0
        Supports Spark Connect.

    Parameters
    ----------
    col : :py:class:`pyspark.sql.Column` or str
        Input column

    Returns
    -------
    :py:class:`pyspark.sql.Column`
        The converted column of dense vectors.

    Examples
    --------
    >>> from pyspark.ml.functions import array_to_vector
    >>> df1 = spark.createDataFrame([([1.5, 2.5],),], schema='v1 array<double>')
    >>> df1.select(array_to_vector('v1').alias('vec1')).collect()
    [Row(vec1=DenseVector([1.5, 2.5]))]
    >>> df2 = spark.createDataFrame([([1.5, 3.5],),], schema='v1 array<float>')
    >>> df2.select(array_to_vector('v1').alias('vec1')).collect()
    [Row(vec1=DenseVector([1.5, 3.5]))]
    >>> df3 = spark.createDataFrame([([1, 3],),], schema='v1 array<int>')
    >>> df3.select(array_to_vector('v1').alias('vec1')).collect()
    [Row(vec1=DenseVector([1.0, 3.0]))]
    """
    from pyspark.core.context import SparkContext
    from pyspark.sql.classic.column import Column, _to_java_column

    sc = SparkContext._active_spark_context
    assert sc is not None and sc._jvm is not None
    return Column(sc._jvm.org.apache.spark.ml.functions.array_to_vector(_to_java_column(col)))

def _batched(
    data: Union[pd.Series, pd.DataFrame, Tuple[pd.Series]], batch_size: int
) -> Iterator[pd.DataFrame]:
    """Generator that splits a pandas dataframe/series into batches."""
    if isinstance(data, pd.DataFrame):
        df = data
    elif isinstance(data, pd.Series):
        df = pd.concat((data,), axis=1)
    else:  # isinstance(data, Tuple[pd.Series]):
        df = pd.concat(data, axis=1)

    index = 0
    data_size = len(df)
    while index < data_size:
        yield df.iloc[index : index + batch_size]
        index += batch_size


def _is_tensor_col(data: Union[pd.Series, pd.DataFrame]) -> bool:
    if isinstance(data, pd.Series):
        return data.dtype == np.object_ and isinstance(data.iloc[0], (np.ndarray, list))
    elif isinstance(data, pd.DataFrame):
        return any(data.dtypes == np.object_) and any(
            [isinstance(d, (np.ndarray, list)) for d in data.iloc[0]]
        )
    else:
        raise ValueError(
            "Unexpected data type: {}, expected pd.Series or pd.DataFrame.".format(type(data))
        )


def _has_tensor_cols(data: Union[pd.Series, pd.DataFrame, Tuple[pd.Series]]) -> bool:
    """Check if input Series/DataFrame/Tuple contains any tensor-valued columns."""
    if isinstance(data, (pd.Series, pd.DataFrame)):
        return _is_tensor_col(data)
    else:  # isinstance(data, Tuple):
        return any(_is_tensor_col(elem) for elem in data)


def _validate_and_transform_multiple_inputs(
    batch: pd.DataFrame, input_shapes: List[Optional[List[int]]], num_input_cols: int
) -> List[np.ndarray]:
    multi_inputs = [batch[col].to_numpy() for col in batch.columns]
    if input_shapes:
        if len(input_shapes) == num_input_cols:
            multi_inputs = [
                np.vstack(v).reshape([-1] + input_shapes[i])  # type: ignore
                if input_shapes[i]
                else v
                for i, v in enumerate(multi_inputs)
            ]
            if not all([len(x) == len(batch) for x in multi_inputs]):
                raise ValueError("Input data does not match expected shape.")
        else:
            raise ValueError("input_tensor_shapes must match columns")

    return multi_inputs


def _validate_and_transform_single_input(
    batch: pd.DataFrame,
    input_shapes: List[List[int] | None],
    has_tensors: bool,
    has_tuple: bool,
) -> np.ndarray:
    # multiple input columns for single expected input
    if has_tensors:
        # tensor columns
        if len(batch.columns) == 1:
            # one tensor column and one expected input, vstack rows
            single_input = np.vstack(batch.iloc[:, 0])
        else:
            raise ValueError(
                "Multiple input columns found, but model expected a single "
                "input, use `array` to combine columns into tensors."
            )
    else:
        # scalar columns
        if len(batch.columns) == 1:
            # single scalar column, remove extra dim
            np_batch = batch.to_numpy()
            single_input = np.squeeze(np_batch, -1) if len(np_batch.shape) > 1 else np_batch
            if input_shapes and input_shapes[0] not in [None, [], [1]]:
                raise ValueError("Invalid input_tensor_shape for scalar column.")
        elif not has_tuple:
            # columns grouped via `array`, convert to single tensor
            single_input = batch.to_numpy()
            if input_shapes and input_shapes[0] != [len(batch.columns)]:
                raise ValueError("Input data does not match expected shape.")
        else:
            raise ValueError(
                "Multiple input columns found, but model expected a single "
                "input, use `array` to combine columns into tensors."
            )

    # if input_tensor_shapes provided, try to reshape input
    if input_shapes:
        if len(input_shapes) == 1:
            single_input = single_input.reshape([-1] + input_shapes[0])  # type: ignore
            if len(single_input) != len(batch):
                raise ValueError("Input data does not match expected shape.")
        else:
            raise ValueError("Multiple input_tensor_shapes found, but model expected one input")

    return single_input


def _validate_and_transform_prediction_result(
    preds: np.ndarray | Mapping[str, np.ndarray] | List[Mapping[str, Any]],
    num_input_rows: int,
    return_type: DataType,
) -> pd.DataFrame | pd.Series:
    """Validate numpy-based model predictions against the expected pandas_udf return_type and
    transforms the predictions into an equivalent pandas DataFrame or Series."""
    if isinstance(return_type, StructType):
        struct_rtype: StructType = return_type
        fieldNames = struct_rtype.names
        if isinstance(preds, dict):  # dictionary of columns
            predNames = list(preds.keys())
            for field in struct_rtype.fields:
                if isinstance(field.dataType, ArrayType):
                    if len(preds[field.name].shape) == 2:
                        preds[field.name] = list(preds[field.name])
                    else:
                        raise ValueError(
                            "Prediction results for ArrayType must be two-dimensional."
                        )
                elif isinstance(field.dataType, supported_scalar_types):
                    if len(preds[field.name].shape) != 1:
                        raise ValueError(
                            "Prediction results for scalar types must be one-dimensional."
                        )
                else:
                    raise ValueError("Unsupported field type in return struct type.")

                if len(preds[field.name]) != num_input_rows:
                    raise ValueError("Prediction results must have same length as input data")

        elif isinstance(preds, list) and isinstance(preds[0], dict):  # rows of dictionaries
            predNames = list(preds[0].keys())
            if len(preds) != num_input_rows:
                raise ValueError("Prediction results must have same length as input data.")

            for field in struct_rtype.fields:
                if isinstance(field.dataType, ArrayType):
                    if len(preds[0][field.name].shape) != 1:
                        raise ValueError(
                            "Prediction results for ArrayType must be one-dimensional."
                        )
                elif isinstance(field.dataType, supported_scalar_types):
                    if not np.isscalar(preds[0][field.name]):
                        raise ValueError("Invalid scalar prediction result.")
                else:
                    raise ValueError("Unsupported field type in return struct type.")
        else:
            raise ValueError(
                "Prediction results for StructType must be a dictionary or "
                "a list of dictionary, got: {}".format(type(preds))
            )

        # check column names
        if set(predNames) != set(fieldNames):
            raise ValueError(
                "Prediction result columns did not match expected return_type "
                "columns: expected {}, got: {}".format(fieldNames, predNames)
            )

        return pd.DataFrame(preds)
    elif isinstance(return_type, ArrayType):
        if isinstance(preds, np.ndarray):
            if len(preds) != num_input_rows:
                raise ValueError("Prediction results must have same length as input data.")
            if len(preds.shape) != 2:
                raise ValueError("Prediction results for ArrayType must be two-dimensional.")
        else:
            raise ValueError("Prediction results for ArrayType must be an ndarray.")

        return pd.Series(list(preds))
    elif isinstance(return_type, supported_scalar_types):
        preds_array: np.ndarray = preds  # type: ignore
        if len(preds_array) != num_input_rows:
            raise ValueError("Prediction results must have same length as input data.")
        if not (
            (len(preds_array.shape) == 2 and preds_array.shape[1] == 1)
            or len(preds_array.shape) == 1
        ):
            raise ValueError("Invalid shape for scalar prediction result.")

        output = np.squeeze(preds_array, -1) if len(preds_array.shape) > 1 else preds_array
        return pd.Series(output).astype(output.dtype)
    else:
        raise ValueError("Unsupported return type")

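
# Illustrative sketch (an assumed example, not part of this module) of the reshaping performed
# above: a flattened tensor column paired with input_tensor_shapes=[[2, 2]] is restacked per
# batch as
#
#   flat = pd.Series([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]])
#   np.vstack(flat).reshape([-1] + [2, 2])   # -> shape (2, 2, 2); batch dim inferred via -1
#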
def predict_batch_udf(
    make_predict_fn: Callable[
        [],
        PredictBatchFunction,
    ],
    *,
    return_type: DataType,
    batch_size: int,
    input_tensor_shapes: Optional[Union[List[Optional[List[int]]], Mapping[int, List[int]]]] = None,
) -> UserDefinedFunctionLike:
    """Given a function which loads a model and returns a `predict` function for inference over a
    batch of numpy inputs, returns a Pandas UDF wrapper for inference over a Spark DataFrame.

    The returned Pandas UDF does the following on each DataFrame partition:

    * calls the `make_predict_fn` to load the model and cache its `predict` function.
    * batches the input records as numpy arrays and invokes `predict` on each batch.

    Note: this assumes that the `make_predict_fn` encapsulates all of the necessary dependencies
    for running the model, or the Spark executor environment already satisfies all runtime
    requirements.

    For the conversion of the Spark DataFrame to numpy arrays, there is a one-to-one mapping
    between the input arguments of the `predict` function (returned by the `make_predict_fn`) and
    the input columns sent to the Pandas UDF (returned by the `predict_batch_udf`) at runtime.
    Each input column will be converted as follows:

    * scalar column -> 1-dim np.ndarray
    * tensor column + tensor shape -> N-dim np.ndarray

    Note that any tensor columns in the Spark DataFrame must be represented as a flattened
    one-dimensional array, and multiple scalar columns can be combined into a single tensor
    column using the standard :py:func:`pyspark.sql.functions.array()` function.

    .. versionadded:: 3.4.0

    Parameters
    ----------
    make_predict_fn : callable
        Function which is responsible for loading a model and returning a
        :py:class:`PredictBatchFunction` which takes one or more numpy arrays as input and
        returns one of the following:

        * a numpy array (for a single output)
        * a dictionary of named numpy arrays (for multiple outputs)
        * a row-oriented list of dictionaries (for multiple outputs).

        For a dictionary of named numpy arrays, the arrays can only be one or two dimensional,
        since higher dimensional arrays are not supported.  For a row-oriented list of
        dictionaries, each element in the dictionary must be either a scalar or one-dimensional
        array.
    return_type : :py:class:`pyspark.sql.types.DataType` or str.
        Spark SQL datatype for the expected output:

        * Scalar (e.g. IntegerType, FloatType) --> 1-dim numpy array.
        * ArrayType --> 2-dim numpy array.
        * StructType --> dict with keys matching struct fields.
        * StructType --> list of dict with keys matching struct fields, for models like the
          `Huggingface pipeline for sentiment analysis
          <https://huggingface.co/docs/transformers/quicktour#pipeline-usage>`_.
    batch_size : int
        Batch size to use for inference.  This is typically a limitation of the model and/or
        available hardware resources and is usually smaller than the Spark partition size.
    input_tensor_shapes : list, dict, optional.
        A list of ints or a dictionary of ints (key) and list of ints (value).
        Input tensor shapes for models with tensor inputs.  This can be a list of shapes,
        where each shape is a list of integers or None (for scalar inputs).  Alternatively, this
        can be represented by a "sparse" dictionary, where the keys are the integer indices of
        the inputs, and the values are the shapes.  Each tensor input value in the Spark
        DataFrame must be represented as a single column containing a flattened 1-D array.  The
        provided `input_tensor_shapes` will be used to reshape the flattened array into the
        expected tensor shape.  For the list form, the order of the tensor shapes must match the
        order of the selected DataFrame columns.  The batch dimension (typically -1 or None in
        the first dimension) should not be included, since it will be determined by the
        batch_size argument.  Tabular datasets with scalar-valued columns should not provide this
        argument.

    Returns
    -------
    :py:class:`UserDefinedFunctionLike`
        A Pandas UDF for model inference on a Spark DataFrame.

    Examples
    --------
    For a pre-trained TensorFlow MNIST model with two-dimensional input images represented as a
    flattened tensor value stored in a single Spark DataFrame column of type `array<float>`.

    .. code-block:: python

        from pyspark.ml.functions import predict_batch_udf

        def make_mnist_fn():
            # load/init happens once per python worker
            import tensorflow as tf
            model = tf.keras.models.load_model('/path/to/mnist_model')

            # predict on batches of tasks/partitions, using cached model
            def predict(inputs: np.ndarray) -> np.ndarray:
                # inputs.shape = [batch_size, 784], see input_tensor_shapes
                # outputs.shape = [batch_size, 10], see return_type
                return model.predict(inputs)

            return predict

        mnist_udf = predict_batch_udf(make_mnist_fn,
                                      return_type=ArrayType(FloatType()),
                                      batch_size=100,
                                      input_tensor_shapes=[[784]])

        df = spark.read.parquet("/path/to/mnist_data")
        df.show(5)
        # +--------------------+
        # |                data|
        # +--------------------+
        # |[0.0, 0.0, 0.0, 0...|
        # |[0.0, 0.0, 0.0, 0...|
        # |[0.0, 0.0, 0.0, 0...|
        # |[0.0, 0.0, 0.0, 0...|
        # |[0.0, 0.0, 0.0, 0...|
        # +--------------------+

        df.withColumn("preds", mnist_udf("data")).show(5)
        # +--------------------+--------------------+
        # |                data|               preds|
        # +--------------------+--------------------+
        # |[0.0, 0.0, 0.0, 0...|[-13.511008, 8.84...|
        # |[0.0, 0.0, 0.0, 0...|[-5.3957458, -2.2...|
        # |[0.0, 0.0, 0.0, 0...|[-7.2014456, -8.8...|
        # |[0.0, 0.0, 0.0, 0...|[-19.466187, -13....|
        # |[0.0, 0.0, 0.0, 0...|[-5.7757926, -7.8...|
        # +--------------------+--------------------+

    To demonstrate usage with different combinations of input and output types, the following
    examples just use simple mathematical transforms as the models.

    * Single scalar column
        Input DataFrame has a single scalar column, which will be passed to the `predict`
        function as a 1-D numpy array.

        >>> import numpy as np
        >>> import pandas as pd
        >>> from pyspark.ml.functions import predict_batch_udf
        >>> from pyspark.sql.types import FloatType
        >>>
        >>> df = spark.createDataFrame(pd.DataFrame(np.arange(100)))
        >>> df.show(5)
        +---+
        |  0|
        +---+
        |  0|
        |  1|
        |  2|
        |  3|
        |  4|
        +---+
        only showing top 5 rows

        >>> def make_times_two_fn():
        ...     def predict(inputs: np.ndarray) -> np.ndarray:
        ...         # inputs.shape = [batch_size]
        ...         # outputs.shape = [batch_size]
        ...         return inputs * 2
        ...     return predict
        ...
        >>> times_two_udf = predict_batch_udf(make_times_two_fn,
        ...                                   return_type=FloatType(),
        ...                                   batch_size=10)
        >>> df = spark.createDataFrame(pd.DataFrame(np.arange(100)))
        >>> df.withColumn("x2", times_two_udf("0")).show(5)
        +---+---+
        |  0| x2|
        +---+---+
        |  0|0.0|
        |  1|2.0|
        |  2|4.0|
        |  3|6.0|
        |  4|8.0|
        +---+---+
        only showing top 5 rows

    * Multiple scalar columns
        Input DataFrame has multiple columns of scalar values.  If the user-provided `predict`
        function expects a single input, then the user must combine the multiple columns into a
        single tensor using `pyspark.sql.functions.array`.

        >>> import numpy as np
        >>> import pandas as pd
        >>> from pyspark.ml.functions import predict_batch_udf
        >>> from pyspark.sql.functions import array
        >>>
        >>> data = np.arange(0, 1000, dtype=np.float64).reshape(-1, 4)
        >>> pdf = pd.DataFrame(data, columns=['a','b','c','d'])
        >>> df = spark.createDataFrame(pdf)
        >>> df.show(5)
        +----+----+----+----+
        |   a|   b|   c|   d|
        +----+----+----+----+
        | 0.0| 1.0| 2.0| 3.0|
        | 4.0| 5.0| 6.0| 7.0|
        | 8.0| 9.0|10.0|11.0|
        |12.0|13.0|14.0|15.0|
        |16.0|17.0|18.0|19.0|
        +----+----+----+----+
        only showing top 5 rows

        >>> def make_sum_fn():
        ...     def predict(inputs: np.ndarray) -> np.ndarray:
        ...         # inputs.shape = [batch_size, 4]
        ...         # outputs.shape = [batch_size]
        ...         return np.sum(inputs, axis=1)
        ...     return predict
        ...
        >>> sum_udf = predict_batch_udf(make_sum_fn,
        ...                             return_type=FloatType(),
        ...                             batch_size=10,
        ...                             input_tensor_shapes=[[4]])
        >>> df.withColumn("sum", sum_udf(array("a", "b", "c", "d"))).show(5)
        +----+----+----+----+----+
        |   a|   b|   c|   d| sum|
        +----+----+----+----+----+
        | 0.0| 1.0| 2.0| 3.0| 6.0|
        | 4.0| 5.0| 6.0| 7.0|22.0|
        | 8.0| 9.0|10.0|11.0|38.0|
        |12.0|13.0|14.0|15.0|54.0|
        |16.0|17.0|18.0|19.0|70.0|
        +----+----+----+----+----+
        only showing top 5 rows

        If the `predict` function expects multiple inputs, then the number of selected input
        columns must match the number of expected inputs.

        >>> def make_sum_fn():
        ...     def predict(x1: np.ndarray,
        ...                 x2: np.ndarray,
        ...                 x3: np.ndarray,
        ...                 x4: np.ndarray) -> np.ndarray:
        ...         # xN.shape = [batch_size]
        ...         # outputs.shape = [batch_size]
        ...         return x1 + x2 + x3 + x4
        ...     return predict
        ...
        >>> sum_udf = predict_batch_udf(make_sum_fn,
        ...                             return_type=FloatType(),
        ...                             batch_size=10)
        >>> df.withColumn("sum", sum_udf("a", "b", "c", "d")).show(5)
        +----+----+----+----+----+
        |   a|   b|   c|   d| sum|
        +----+----+----+----+----+
        | 0.0| 1.0| 2.0| 3.0| 6.0|
        | 4.0| 5.0| 6.0| 7.0|22.0|
        | 8.0| 9.0|10.0|11.0|38.0|
        |12.0|13.0|14.0|15.0|54.0|
        |16.0|17.0|18.0|19.0|70.0|
        +----+----+----+----+----+
        only showing top 5 rows

    * Multiple tensor columns
        Input DataFrame has multiple columns, where each column is a tensor.  The number of
        columns should match the number of expected inputs for the user-provided `predict`
        function.

        >>> import numpy as np
        >>> import pandas as pd
        >>> from pyspark.ml.functions import predict_batch_udf
        >>> from pyspark.sql.types import ArrayType, FloatType, StructType, StructField
        >>> from typing import Mapping
        >>>
        >>> data = np.arange(0, 1000, dtype=np.float64).reshape(-1, 4)
        >>> pdf = pd.DataFrame(data, columns=['a','b','c','d'])
        >>> pdf_tensor = pd.DataFrame()
        >>> pdf_tensor['t1'] = pdf.values.tolist()
        >>> pdf_tensor['t2'] = pdf.drop(columns='d').values.tolist()
        >>> df = spark.createDataFrame(pdf_tensor)
        >>> df.show(5)
        +--------------------+------------------+
        |                  t1|                t2|
        +--------------------+------------------+
        |[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|
        |[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|
        |[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|
        |[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|
        |[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|
        +--------------------+------------------+
        only showing top 5 rows

        >>> def make_multi_sum_fn():
        ...     def predict(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
        ...         # x1.shape = [batch_size, 4]
        ...         # x2.shape = [batch_size, 3]
        ...         # outputs.shape = [batch_size]
        ...         return np.sum(x1, axis=1) + np.sum(x2, axis=1)
        ...     return predict
        ...
        >>> multi_sum_udf = predict_batch_udf(
        ...     make_multi_sum_fn,
        ...     return_type=FloatType(),
        ...     batch_size=5,
        ...     input_tensor_shapes=[[4], [3]],
        ... )
        >>> df.withColumn("sum", multi_sum_udf("t1", "t2")).show(5)
        +--------------------+------------------+-----+
        |                  t1|                t2|  sum|
        +--------------------+------------------+-----+
        |[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|  9.0|
        |[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]| 37.0|
        |[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]| 65.0|
        |[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]| 93.0|
        |[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|121.0|
        +--------------------+------------------+-----+
        only showing top 5 rows

    * Multiple outputs
        Some models can provide multiple outputs.  These can be returned as a dictionary of named
        values, which can be represented in either columnar or row-based formats.

        >>> def make_multi_sum_fn():
        ...     def predict_columnar(x1: np.ndarray, x2: np.ndarray) -> Mapping[str, np.ndarray]:
        ...         # x1.shape = [batch_size, 4]
        ...         # x2.shape = [batch_size, 3]
        ...         return {
        ...             "sum1": np.sum(x1, axis=1),
        ...             "sum2": np.sum(x2, axis=1)
        ...         }
        ...     return predict_columnar
        ...
        >>> multi_sum_udf = predict_batch_udf(
        ...     make_multi_sum_fn,
        ...     return_type=StructType([
        ...         StructField("sum1", FloatType(), True),
        ...         StructField("sum2", FloatType(), True)
        ...     ]),
        ...     batch_size=5,
        ...     input_tensor_shapes=[[4], [3]],
        ... )
        >>> df.withColumn("preds", multi_sum_udf("t1", "t2")).select("t1", "t2", "preds.*").show(5)
        +--------------------+------------------+----+----+
        |                  t1|                t2|sum1|sum2|
        +--------------------+------------------+----+----+
        |[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]| 6.0| 3.0|
        |[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|22.0|15.0|
        |[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|38.0|27.0|
        |[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|54.0|39.0|
        |[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|70.0|51.0|
        +--------------------+------------------+----+----+
        only showing top 5 rows

        >>> def make_multi_sum_fn():
        ...     def predict_row(x1: np.ndarray, x2: np.ndarray) -> list[Mapping[str, float]]:
        ...         # x1.shape = [batch_size, 4]
        ...         # x2.shape = [batch_size, 3]
        ...         return [{'sum1': np.sum(x1[i]), 'sum2': np.sum(x2[i])} for i in range(len(x1))]
        ...     return predict_row
        ...
        >>> multi_sum_udf = predict_batch_udf(
        ...     make_multi_sum_fn,
        ...     return_type=StructType([
        ...         StructField("sum1", FloatType(), True),
        ...         StructField("sum2", FloatType(), True)
        ...     ]),
        ...     batch_size=5,
        ...     input_tensor_shapes=[[4], [3]],
        ... )
        >>> df.withColumn("sum", multi_sum_udf("t1", "t2")).select("t1", "t2", "sum.*").show(5)
        +--------------------+------------------+----+----+
        |                  t1|                t2|sum1|sum2|
        +--------------------+------------------+----+----+
        |[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]| 6.0| 3.0|
        |[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|22.0|15.0|
        |[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|38.0|27.0|
        |[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|54.0|39.0|
        |[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|70.0|51.0|
        +--------------------+------------------+----+----+
        only showing top 5 rows

        Note that the multiple outputs can be arrays as well.

        >>> def make_multi_times_two_fn():
        ...     def predict(x1: np.ndarray, x2: np.ndarray) -> Mapping[str, np.ndarray]:
        ...         # x1.shape = [batch_size, 4]
        ...         # x2.shape = [batch_size, 3]
        ...         return {"t1x2": x1 * 2, "t2x2": x2 * 2}
        ...     return predict
        ...
        >>> multi_times_two_udf = predict_batch_udf(
        ...     make_multi_times_two_fn,
        ...     return_type=StructType([
        ...         StructField("t1x2", ArrayType(FloatType()), True),
        ...         StructField("t2x2", ArrayType(FloatType()), True)
        ...     ]),
        ...     batch_size=5,
        ...     input_tensor_shapes=[[4], [3]],
        ... )
        >>> df.withColumn("x2", multi_times_two_udf("t1", "t2")).select("t1", "t2", "x2.*").show(5)
        +--------------------+------------------+--------------------+------------------+
        |                  t1|                t2|                t1x2|              t2x2|
        +--------------------+------------------+--------------------+------------------+
        |[0.0, 1.0, 2.0, 3.0]|   [0.0, 1.0, 2.0]|[0.0, 2.0, 4.0, 6.0]|   [0.0, 2.0, 4.0]|
        |[4.0, 5.0, 6.0, 7.0]|   [4.0, 5.0, 6.0]|[8.0, 10.0, 12.0,...| [8.0, 10.0, 12.0]|
        |[8.0, 9.0, 10.0, ...|  [8.0, 9.0, 10.0]|[16.0, 18.0, 20.0...|[16.0, 18.0, 20.0]|
        |[12.0, 13.0, 14.0...|[12.0, 13.0, 14.0]|[24.0, 26.0, 28.0...|[24.0, 26.0, 28.0]|
        |[16.0, 17.0, 18.0...|[16.0, 17.0, 18.0]|[32.0, 34.0, 36.0...|[32.0, 34.0, 36.0]|
        +--------------------+------------------+--------------------+------------------+
        only showing top 5 rows
    """
    # generate a new uuid each time this is invoked on the driver to invalidate executor-side cache
    model_uuid = uuid.uuid4()

    def predict(data: Iterator[Union[pd.Series, pd.DataFrame]]) -> Iterator[pd.DataFrame]:
        # TODO: adjust return type hint when Iterator[Union[pd.Series, pd.DataFrame]] is supported
        from pyspark.ml.model_cache import ModelCache

        # get predict function (from cache or from running user-provided make_predict_fn)
        predict_fn = ModelCache.get(model_uuid)
        if not predict_fn:
            predict_fn = make_predict_fn()
            ModelCache.add(model_uuid, predict_fn)

        # get number of expected parameters for predict function
        signature = inspect.signature(predict_fn)
        num_expected_cols = len(signature.parameters)

        # convert sparse input_tensor_shapes to dense if needed
        input_shapes: List[List[int] | None]
        if isinstance(input_tensor_shapes, Mapping):
            input_shapes = [None] * num_expected_cols
            for index, shape in input_tensor_shapes.items():
                input_shapes[index] = shape
        else:
            input_shapes = input_tensor_shapes  # type: ignore

        # iterate over pandas batch, invoking predict_fn with ndarrays
        for pandas_batch in data:
            has_tuple = isinstance(pandas_batch, Tuple)  # type: ignore
            has_tensors = _has_tensor_cols(pandas_batch)

            # require input_tensor_shapes for any tensor columns
            if has_tensors and not input_shapes:
                raise ValueError("Tensor columns require input_tensor_shapes")

            for batch in _batched(pandas_batch, batch_size):
                num_input_rows = len(batch)
                num_input_cols = len(batch.columns)
                if num_input_cols == num_expected_cols and num_expected_cols > 1:
                    # input column per expected input for multiple inputs
                    multi_inputs = _validate_and_transform_multiple_inputs(
                        batch, input_shapes, num_input_cols
                    )
                    # run model prediction function on multiple (numpy) inputs
                    preds = predict_fn(*multi_inputs)
                elif num_expected_cols == 1:
                    # one or more input columns for single expected input
                    single_input = _validate_and_transform_single_input(
                        batch, input_shapes, has_tensors, has_tuple
                    )
                    # run model prediction function on single (numpy) inputs
                    preds = predict_fn(single_input)
                else:
                    msg = "Model expected {} inputs, but received {} columns"
                    raise ValueError(msg.format(num_expected_cols, num_input_cols))

                # return transformed predictions to Spark
                yield _validate_and_transform_prediction_result(
                    preds, num_input_rows, return_type
                )  # type: ignore

    return pandas_udf(predict, return_type)  # type: ignore[call-overload]

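
# Illustrative usage sketch (an assumption mirroring the docstring above, not part of this
# module): when only some inputs are tensors, input_tensor_shapes can be given in its "sparse"
# dictionary form, e.g. for a hypothetical make_fn whose predict takes two arguments:
#
#   udf = predict_batch_udf(make_fn,
#                           return_type=FloatType(),
#                           batch_size=10,
#                           input_tensor_shapes={0: [4]})  # input 0 is a tensor, input 1 scalar
#
# predict() above expands this into a dense list of shapes, using None for non-tensor inputs.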
def _test() -> None:
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.ml.functions
    import sys
    from pyspark.sql.pandas.utils import (
        require_minimum_pandas_version,
        require_minimum_pyarrow_version,
    )

    try:
        require_minimum_pandas_version()
        require_minimum_pyarrow_version()
    except Exception as e:
        print(
            f"Skipping pyspark.ml.functions doctests: {e}",
            file=sys.stderr,
        )
        sys.exit(0)

    globs = pyspark.ml.functions.__dict__.copy()
    spark = SparkSession.builder.master("local[2]").appName("ml.functions tests").getOrCreate()
    sc = spark.sparkContext
    globs["sc"] = sc
    globs["spark"] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.ml.functions,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()