Source code for sparkdl.estimators.keras_image_file_estimator

#
# Copyright 2017 Databricks, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pylint: disable=protected-access
from __future__ import absolute_import, division, print_function

import logging
import numpy as np

from pyspark.ml import Estimator
import pyspark.ml.linalg as spla
from pyspark.ml.param import Param, Params, TypeConverters

from sparkdl.image.imageIO import imageStructToArray
from sparkdl.param import (
    keyword_only, CanLoadImage, HasKerasModel, HasKerasOptimizer, HasKerasLoss, HasOutputMode,
    HasInputCol, HasInputImageNodeName, HasLabelCol, HasOutputNodeName, HasOutputCol)
from sparkdl.transformers.keras_image import KerasImageFileTransformer
import sparkdl.utils.jvmapi as JVMAPI
import sparkdl.utils.keras_model as kmutil

__all__ = ['KerasImageFileEstimator']

logger = logging.getLogger('sparkdl')

class KerasImageFileEstimator(Estimator, HasInputCol, HasInputImageNodeName, HasOutputCol,
                              HasOutputNodeName, HasLabelCol, HasKerasModel, HasKerasOptimizer,
                              HasKerasLoss, CanLoadImage, HasOutputMode):
    """
    Build an Estimator from a Keras model.

    First, create a model and save it to the file system.

    .. code-block:: python

        from keras.applications.resnet50 import ResNet50
        model = ResNet50(weights=None)
        model.save("path_to_my_model.h5")

    Then, create an image loading function that reads image data from a URI,
    preprocesses it, and returns the numerical tensor.

    .. code-block:: python

        def load_image_and_process(uri):
            import numpy as np
            import PIL.Image
            from keras.applications.imagenet_utils import preprocess_input

            original_image = PIL.Image.open(uri).convert('RGB')
            resized_image = original_image.resize((224, 224), PIL.Image.ANTIALIAS)
            image_array = np.array(resized_image).astype(np.float32)
            image_tensor = preprocess_input(image_array[np.newaxis, :])
            return image_tensor

    Assume the image URIs live in the following DataFrame.

    .. code-block:: python

        original_dataset = spark.createDataFrame([
            Row(imageUri="image1_uri", imageLabel="image1_label"),
            Row(imageUri="image2_uri", imageLabel="image2_label"),
            # and more rows ...
        ])
        stringIndexer = StringIndexer(inputCol="imageLabel", outputCol="categoryIndex")
        indexed_dataset = stringIndexer.fit(original_dataset).transform(original_dataset)
        encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
        image_dataset = encoder.transform(indexed_dataset)

    We can then create a Keras estimator that takes our saved model file and
    trains it using Spark.

    .. code-block:: python

        estimator = KerasImageFileEstimator(inputCol="imageUri",
                                            outputCol="name_of_result_column",
                                            labelCol="categoryVec",
                                            imageLoader=load_image_and_process,
                                            kerasOptimizer="adam",
                                            kerasLoss="categorical_crossentropy",
                                            kerasFitParams={"epochs": 5, "batch_size": 64},
                                            modelFile="path_to_my_model.h5")

        transformers = estimator.fit(image_dataset)
    """

    @keyword_only
    def __init__(self, inputCol=None, inputImageNodeName=None, outputCol=None,
                 outputNodeName=None, outputMode="vector", labelCol=None, modelFile=None,
                 imageLoader=None, kerasOptimizer=None, kerasLoss=None, kerasFitParams=None):
        """
        __init__(self, inputCol=None, inputImageNodeName=None, outputCol=None,
                 outputNodeName=None, outputMode="vector", labelCol=None, modelFile=None,
                 imageLoader=None, kerasOptimizer=None, kerasLoss=None, kerasFitParams=None)
        """
        # NOTE(phi-dbq): currently we ignore the output mode, as the actual outputs are the
        #                trained models and the Transformers built from them.
        super(KerasImageFileEstimator, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
    @keyword_only
    def setParams(self, inputCol=None, inputImageNodeName=None, outputCol=None,
                  outputNodeName=None, outputMode="vector", labelCol=None, modelFile=None,
                  imageLoader=None, kerasOptimizer=None, kerasLoss=None, kerasFitParams=None):
        """
        setParams(self, inputCol=None, inputImageNodeName=None, outputCol=None,
                  outputNodeName=None, outputMode="vector", labelCol=None, modelFile=None,
                  imageLoader=None, kerasOptimizer=None, kerasLoss=None, kerasFitParams=None)
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
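    # Illustrative sketch (not part of the class): an already-constructed estimator
    # can be reconfigured in place via setParams before refitting. `estimator` is
    # assumed to be a KerasImageFileEstimator built as in the class docstring.
    #
    #     estimator.setParams(kerasOptimizer="sgd",
    #                         kerasFitParams={"epochs": 10, "batch_size": 32})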
    def fit(self, dataset, params=None):
        """
        Fits a model to the input dataset with optional parameters.

        .. warning:: This returns the byte-serialized HDF5 file for each model to the driver.
                     If the model files are large, the driver might run out of memory. As we
                     cannot assume the existence of a sufficiently large (and writable) file
                     system, users are advised not to train too many models in a single
                     Spark job.

        :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`.
                        The column `inputCol` should be of type
                        `sparkdl.image.imageIO.imgSchema`.
        :param params: an optional param map that overrides embedded params. If a list/tuple
                       of param maps is given, this calls fit on each param map and returns
                       a list of models.
        :return: fitted model(s). If params includes a list of param maps, the order of these
                 models matches the order of the param maps.
        """
        self._validateParams()
        if params is None:
            paramMaps = [dict()]
        elif isinstance(params, (list, tuple)):
            if len(params) == 0:
                paramMaps = [dict()]
            else:
                self._validateFitParams(params)
                paramMaps = params
        elif isinstance(params, dict):
            self._validateFitParams(params)
            paramMaps = [params]
        else:
            raise ValueError("Params must be either a param map or a list/tuple of param maps, "
                             "but got %s." % type(params))
        return self._fitInParallel(dataset, paramMaps)
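    # Illustrative sketch (assumed names, not part of the class): fit() accepts a
    # list of param maps keyed by Param objects and fits one model per map, one per
    # Spark task. `estimator` and `image_dataset` are as in the class docstring;
    # the `kerasFitParams` attribute is assumed to follow the usual pyspark Param
    # naming convention.
    #
    #     paramMaps = [
    #         {estimator.kerasFitParams: {"epochs": 5, "batch_size": 32}},
    #         {estimator.kerasFitParams: {"epochs": 5, "batch_size": 64}},
    #     ]
    #     fitted = estimator.fit(image_dataset, params=paramMaps)  # one entry per map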
    def _validateParams(self):
        """
        Check Param values so we can throw errors on the driver, rather than on the workers.
        :return: True if parameters are valid
        """
        if not self.isDefined(self.inputCol):
            raise ValueError("Input column must be defined")
        if not self.isDefined(self.outputCol):
            raise ValueError("Output column must be defined")
        return True

    def _validateFitParams(self, params):
        """ Check if an input parameter set is valid """
        if isinstance(params, (list, tuple, dict)):
            assert self.getInputCol() not in params, \
                "params {} cannot contain input column name {}".format(params, self.getInputCol())
        return True

    def _getNumpyFeaturesAndLabels(self, dataset):
        """
        We assume the training data fits in memory on a single server.
        The input dataframe is converted to numerical image features and
        broadcast to all the worker nodes.
        """
        image_uri_col = self.getInputCol()
        label_col = None
        if self.isDefined(self.labelCol) and self.getLabelCol() != "":
            label_col = self.getLabelCol()

        tmp_image_col = self._loadedImageCol()
        image_df = self.loadImagesInternal(dataset, image_uri_col).dropna(subset=[tmp_image_col])

        # Extract features
        localFeatures = []
        rows = image_df.collect()
        for row in rows:
            spimg = row[tmp_image_col]
            features = imageStructToArray(spimg)
            localFeatures.append(features)

        # NOTE(phi-dbq): PEP 8 recommends against testing `0 == len(array)`
        if not localFeatures:
            raise ValueError("Cannot extract any feature from dataset!")
        X = np.stack(localFeatures, axis=0)

        # Extract labels
        y = None
        if label_col is not None:
            label_schema = image_df.schema[label_col]
            label_dtype = label_schema.dataType
            assert isinstance(label_dtype, spla.VectorUDT), \
                "must encode labels in one-hot vector format, but got {}".format(label_dtype)

            localLabels = []
            for row in rows:
                try:
                    _keras_label = row[label_col].array
                except ValueError:
                    raise ValueError("Cannot extract encoded label array")
                localLabels.append(_keras_label)

            if not localLabels:
                raise ValueError("Failed to load any labels from dataset, but labels are required")

            y = np.stack(localLabels, axis=0)
            assert y.shape[0] == X.shape[0], \
                "number of features {} != number of labels {}".format(X.shape[0], y.shape[0])

        return X, y

    def _collectModels(self, kerasModelsBytesRDD):
        """
        Collect Keras models on workers to MLlib Models on the driver.
        :param kerasModelsBytesRDD: RDD of (param_map, model_bytes) tuples
        :return: list of MLlib models
        """
        transformers = []
        for (param_map, model_bytes) in kerasModelsBytesRDD.collect():
            model_filename = kmutil.bytes_to_h5file(model_bytes)
            transformers.append({
                'paramMap': param_map,
                'transformer': KerasImageFileTransformer(modelFile=model_filename)})
        return transformers
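    # Data-format contract restating what the two helpers above implement (shapes
    # are an assumption based on imageStructToArray producing one array per image):
    # _getNumpyFeaturesAndLabels returns X of shape (n_rows, height, width, channels)
    # and y of shape (n_rows, n_classes) with one-hot rows, while _collectModels
    # returns a list of dicts of the form
    #
    #     {'paramMap': <param map used for this fit>,
    #      'transformer': KerasImageFileTransformer(modelFile=<local .h5 path>)}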
    def _fitInParallel(self, dataset, paramMaps):
        """
        Fits len(paramMaps) models in parallel, one in each Spark task.
        :param paramMaps: non-empty list or tuple of ParamMaps (dict values)
        :return: list of fitted models, matching the order of paramMaps
        """
        sc = JVMAPI._curr_sc()
        paramMapsRDD = sc.parallelize(paramMaps, numSlices=len(paramMaps))

        # Extract image URIs from the provided dataset and create features as numpy arrays
        localFeatures, localLabels = self._getNumpyFeaturesAndLabels(dataset)
        localFeaturesBc = sc.broadcast(localFeatures)
        localLabelsBc = None if localLabels is None else sc.broadcast(localLabels)

        # Broadcast the Keras model (HDF5) file content as bytes
        modelBytes = self._loadModelAsBytes()
        modelBytesBc = sc.broadcast(modelBytes)

        # Obtain the params for this estimator instance
        baseParamMap = self.extractParamMap()
        baseParamDict = dict([(param.name, val) for param, val in baseParamMap.items()])
        baseParamDictBc = sc.broadcast(baseParamDict)

        def _local_fit(override_param_map):
            """
            Fit a model locally, combining this estimator's params with the
            overriding parameters provided by the input.
            :param override_param_map: dict keyed by MLlib Params; these override
                                       the base estimator's params.
            :return: serialized Keras HDF5 file bytes
            """
            # Update params
            params = baseParamDictBc.value
            override_param_dict = dict([
                (param.name, val) for param, val in override_param_map.items()])
            params.update(override_param_dict)

            # Create the Keras model
            model = kmutil.bytes_to_model(modelBytesBc.value)
            model.compile(optimizer=params['kerasOptimizer'], loss=params['kerasLoss'])

            # Retrieve features and labels and fit the Keras model
            features = localFeaturesBc.value
            labels = None if localLabelsBc is None else localLabelsBc.value
            _fit_params = params['kerasFitParams']
            model.fit(x=features, y=labels, **_fit_params)

            return kmutil.model_to_bytes(model)

        kerasModelBytesRDD = paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap)))
        return self._collectModels(kerasModelBytesRDD)

    def _loadModelAsBytes(self):
        """
        (usable on driver only)
        Load the Keras model file as a byte string.
        :return: str containing the model data
        """
        with open(self.getModelFile(), mode='rb') as fin:
            fileContent = fin.read()
        return fileContent

    def _fit(self, dataset):  # pylint: disable=unused-argument
        err_msgs = ["This function should not have been called",
                    "Please contact library maintainers to file a bug"]
        raise NotImplementedError('\n'.join(err_msgs))
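# Illustrative end-to-end sketch (assumed names, kept as comments so the module
# stays inert on import): consuming the list returned by fit(), which pairs each
# param map with a KerasImageFileTransformer wrapping the corresponding fitted
# model. `estimator`, `image_dataset`, and `paramMaps` are as in the examples above.
#
#     fitted = estimator.fit(image_dataset, params=paramMaps)
#     chosen = fitted[0]['transformer']   # pick one, e.g. after offline evaluation
#     predictions = chosen.transform(image_dataset)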