SHAP insights

SHAP is an open-source method for explaining the predictions from machine learning models. (You can find more information about SHAP at its repository on GitHub: https://github.com/slundberg/shap) DataRobot supports SHAP computations for all regression and binary classification blueprints. You can compute three different insights:

  • “SHAP matrix”: Raw SHAP values for each feature column and each row.

  • “SHAP impact”: Overall importance for each feature column across all rows, based on aggregated SHAP matrix values.

  • “SHAP preview”: SHAP values for the most important features in each row, presented with the values of the features in that row.

The following example code assumes that you have a trained model object called model.

import datarobot as dr
from datarobot.insights.shap_matrix import ShapMatrix
from datarobot.insights.shap_impact import ShapImpact
from datarobot.insights.shap_preview import ShapPreview
model_id = model.id  # or model_id = 'YOUR_MODEL_ID'
# request SHAP Matrix, and wait for it to complete
result = ShapMatrix.create(entity_id=model_id)  # default source is 'validation'
# view the properties of the SHAP Matrix
print(result.columns)
>>> ['AUCGUART', 'Color', 'Make', ...
print(result.matrix)
>>> [[ 1.22604372e-02  1.98424454e-01  2.23308013e-01  ...] ... ]
# request SHAP Matrix on a different partition, and return immediately with job reference
job = ShapMatrix.compute(entity_id=model_id, source='holdout')
# wait for the job to complete
result = job.get_result_when_complete()
print(result.columns)
>>> ['AUCGUART', 'Color', 'Make', ...
print(result.matrix)
>>> [[-0.11443075 -0.01130723  0.22330801 ... ] ... ]
# request SHAP Impact; only works for training currently
job = ShapImpact.compute(entity_id=model_id, source='training', row_count=100)
result = job.get_result_when_complete()
# Impacts are listed as [feature_name, normalized_impact, unnormalized_impact]
print(result.shap_impacts)
>>> [['AUCGUART', 0.07989059458051094, 0.022147886593333888], ...]
# list all matrices computed for this model, including each partition
matrix_list = ShapMatrix.list(entity_id=model_id)
print(matrix_list)
>>> [<datarobot.insights.shap_matrix.ShapMatrix object at 0x114e52090>, ...]
print([(matrix_obj, matrix_obj.source) for matrix_obj in matrix_list])
>>> [(<datarobot.insights.shap_matrix.ShapMatrix object at 0x114e52090>, 'validation'), ... ]
# upload a file to the AI Catalog
dataset = dr.Dataset.upload("./path/to/dataset.csv")
# request explanations for that file in the "preview" format
job = ShapPreview.compute(entity_id=model_id, source='externalTestSet', external_dataset_id=dataset.id)
result = job.get_result_when_complete()
print(result.previews[0])
>>> {'row_index': 0,
>>> 'prediction_value': 0.3024851286385187,
>>>  'preview_values': [{'feature_rank': 1,
>>>    'feature_name': 'BYRNO',
>>>    'feature_value': '21973',
>>>    'shap_value': 0.22025144078391848,
>>>    'has_text_explanations': False,
>>>    'text_explanations': []},
>>> ... }

SHAP insights for custom models

You can compute SHAP insights for custom models, not just native DataRobot models. To do this, first complete the following setup:

  1. Create a custom model version with an execution environment and a training dataset; note the version ID.

  2. Register the custom model version as a registered model.

  3. Initialize the registered model for insights, using the AutomatedDocument.initialize_model_compliance method.

At this point, the model is ready for SHAP insights computation. Once these steps are completed for a given registered model version, they do not have to be repeated.

As an example, the code snippet below outlines the preparation steps and then requests a ShapMatrix computation on an external dataset via the AI Catalog. It assumes that you have a Scoring Code file, model.jar, for the custom model, which you will run using the Java drop-in execution environment, as well as a training dataset called training.csv.

import datarobot as dr
from datarobot.insights.shap_matrix import ShapMatrix

# 1: create a custom model version with an execution environment and a training dataset, and note the version id
model_args = {
    "target_type": dr.TARGET_TYPE.REGRESSION,
    "target_name": "time_in_hospital",
    "language": "java",
}
training_dataset = dr.Dataset.create_from_file(file_path="path/to/training.csv")
execution_environment = dr.ExecutionEnvironment.list(search_for="java")[0]

custom_model = dr.CustomInferenceModel.create(
    name="model.jar",
    **model_args,
)

custom_model_version = dr.CustomModelVersion.create_clean(
    custom_model_id=custom_model.id,
    base_environment_id=execution_environment.id,
    training_dataset_id=training_dataset.id,
    files=[("path/to/model.jar", "model.jar")],
)
custom_model_version_id = custom_model_version.id

# 2. register the custom model version as a registered model
model_name = "my custom model"
registered_model = dr.RegisteredModelVersion.create_for_custom_model_version(
    custom_model_version_id=custom_model_version.id, name=model_name, registered_model_name=model_name
)

# 3. initialize the registered model for insights
autodocs = dr.AutomatedDocument(
    entity_id=registered_model.id,
    document_type="MODEL_COMPLIANCE",
)
autodocs.initialize_model_compliance()
assert autodocs.is_model_compliance_initialized[0]

# Add the scoring dataset to the AI catalog
scoring_dataset = dr.Dataset.create_from_file(file_path="path/to/scoring_dataset.csv")

# Request the ShapMatrix computation, and retrieve results when it finishes
job = ShapMatrix.compute(
    entity_id=custom_model_version_id,
    source='externalTestSet',
    external_dataset_id=scoring_dataset.id,
    entity_type="customModel",
)
result = job.get_result_when_complete()
print(result.columns)
>>> ['AUCGUART', 'Color', 'Make', ...
print(result.matrix)
>>> [[ 1.22604372e-02  1.98424454e-01  2.23308013e-01  ...] ... ]