Kedro integration guide: Display node metadata and outputs#
You can compare metrics, parameters, dataset versions, and other metadata from Kedro pipeline nodes.
This guide shows how to:
- Log diagnostic charts from Kedro pipeline nodes.
- Save node output as a JSON file.
- Display charts, outputs, and other metadata from Kedro pipelines.
See dashboard in Neptune  Code examples 
Before you start#
- Sign up at neptune.ai/register.
-
Create a project for storing your metadata.
-
Have the Kedro–Neptune plugin configured and initialized according to the Setup guide.
Preparing the training runs#
Setting up the scripts#
In this section, we'll set up the Kedro nodes and add Neptune logging to the code.
-
To log the model training parameters to Neptune automatically, define them in the
conf/base/parameters.yml
file: -
Create a model training node in the
src/KEDRO_PROJECT/nodes.py
file. Use the parameters you defined in the
conf/base/parameters.yml
file. The node should output a trained model.nodes.pyfrom sklearn.ensemble import RandomForestClassifier ... def train_rf_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): max_depth = parameters["rf_max_depth"] n_estimators = parameters["rf_n_estimators"] max_features = parameters["rf_max_features"] clf = RandomForestClassifier( max_depth=max_depth, n_estimators=n_estimators, max_features=max_features, ) clf.fit(train_x, train_y) return clf
nodes.py""" This is a boilerplate pipeline generated using Kedro 0.18.4 """ from typing import Any, Dict, Tuple import matplotlib.pyplot as plt import neptune import numpy as np import pandas as pd from scikitplot.metrics import plot_precision_recall, plot_roc from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score from sklearn.neural_network import MLPClassifier def split_data( data: pd.DataFrame, parameters: Dict[str, Any] ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: """Splits data into features and target training and test sets. Args: data: Data containing features and target. parameters: Parameters defined in parameters.yml. Returns: Split data. """ data_train = data.sample( frac=parameters["train_fraction"], random_state=parameters["random_state"] ) data_test = data.drop(data_train.index) X_train = data_train.drop(columns=parameters["target_column"]) X_test = data_test.drop(columns=parameters["target_column"]) y_train = data_train[parameters["target_column"]] y_test = data_test[parameters["target_column"]] return X_train, X_test, y_train, y_test def train_rf_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): max_depth = parameters["rf_max_depth"] n_estimators = parameters["rf_n_estimators"] max_features = parameters["rf_max_features"] clf = RandomForestClassifier( max_depth=max_depth, n_estimators=n_estimators, max_features=max_features, ) clf.fit(train_x, train_y) return clf def train_mlp_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): """Node for training MLP model""" alpha = parameters["mlp_alpha"] max_iter = parameters["mlp_max_iter"] clf = MLPClassifier(alpha=alpha, max_iter=max_iter) clf.fit(train_x, train_y) return clf def get_predictions( rf_model: RandomForestClassifier, mlp_model: MLPClassifier, test_x: pd.DataFrame ) -> Dict[str, Any]: """Node for making predictions given a pre-trained model and a test set.""" predictions = {} 
for name, model in zip(["rf", "mlp"], [rf_model, mlp_model]): y_pred = model.predict_proba(test_x).tolist() predictions[name] = y_pred return predictions def evaluate_models( predictions: dict, test_y: pd.DataFrame, neptune_run: neptune.handler.Handler ): """Node for - evaluating Random Forest and MLP models - creating ROC and Precision-Recall Curves """ for name, y_pred_proba in predictions.items(): y_true = test_y.to_numpy() y_pred_proba = np.array(y_pred_proba) y_pred = np.argmax(y_pred_proba, axis=1) y_pred = np.where( y_pred == 0, "setosa", np.where(y_pred == 1, "versicolor", "virginica"), ) accuracy = accuracy_score(y_true, y_pred) neptune_run[f"nodes/evaluate_models/metrics/accuracy_{name}"] = accuracy
Note
In this example, we create a Kedro pipeline that trains two models, Random Forest and MLPClassifier, and collects the predictions from both.
For simplicity, we only show the Random Forest code snippets. See the full
nodes.py
for the MLPClassifier. -
Create a model prediction node in the
src/KEDRO_PROJECT/nodes.py
file. This node should output a dictionary with predictions for two models: Random Forest and MLPClassifier.
nodes.pydef get_predictions( rf_model: RandomForestClassifier, mlp_model: MLPClassifier, test_x: pd.DataFrame ) -> Dict[str, Any]: """Node for making predictions given a pre-trained model and a test set.""" predictions = {} for name, model in zip(["rf", "mlp"], [rf_model, mlp_model]): y_pred = model.predict_proba(test_x).tolist() predictions[name] = y_pred return predictions
nodes.py""" This is a boilerplate pipeline generated using Kedro 0.18.4 """ from typing import Any, Dict, Tuple import matplotlib.pyplot as plt import neptune import numpy as np import pandas as pd from scikitplot.metrics import plot_precision_recall, plot_roc from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score from sklearn.neural_network import MLPClassifier def split_data( data: pd.DataFrame, parameters: Dict[str, Any] ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: """Splits data into features and target training and test sets. Args: data: Data containing features and target. parameters: Parameters defined in parameters.yml. Returns: Split data. """ data_train = data.sample( frac=parameters["train_fraction"], random_state=parameters["random_state"] ) data_test = data.drop(data_train.index) X_train = data_train.drop(columns=parameters["target_column"]) X_test = data_test.drop(columns=parameters["target_column"]) y_train = data_train[parameters["target_column"]] y_test = data_test[parameters["target_column"]] return X_train, X_test, y_train, y_test def train_rf_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): max_depth = parameters["rf_max_depth"] n_estimators = parameters["rf_n_estimators"] max_features = parameters["rf_max_features"] clf = RandomForestClassifier( max_depth=max_depth, n_estimators=n_estimators, max_features=max_features, ) clf.fit(train_x, train_y) return clf def train_mlp_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): """Node for training MLP model""" alpha = parameters["mlp_alpha"] max_iter = parameters["mlp_max_iter"] clf = MLPClassifier(alpha=alpha, max_iter=max_iter) clf.fit(train_x, train_y) return clf def get_predictions( rf_model: RandomForestClassifier, mlp_model: MLPClassifier, test_x: pd.DataFrame ) -> Dict[str, Any]: """Node for making predictions given a pre-trained model and a test set.""" predictions = {} 
for name, model in zip(["rf", "mlp"], [rf_model, mlp_model]): y_pred = model.predict_proba(test_x).tolist() predictions[name] = y_pred return predictions def evaluate_models( predictions: dict, test_y: pd.DataFrame, neptune_run: neptune.handler.Handler ): """Node for - evaluating Random Forest and MLP models - creating ROC and Precision-Recall Curves """ for name, y_pred_proba in predictions.items(): y_true = test_y.to_numpy() y_pred_proba = np.array(y_pred_proba) y_pred = np.argmax(y_pred_proba, axis=1) y_pred = np.where( y_pred == 0, "setosa", np.where(y_pred == 1, "versicolor", "virginica"), ) accuracy = accuracy_score(y_true, y_pred) neptune_run[f"nodes/evaluate_models/metrics/accuracy_{name}"] = accuracy
-
Import Neptune towards the top of the
nodes.py
file:nodes.py""" This is a boilerplate pipeline generated using Kedro 0.18.4 """ from typing import Any, Dict, Tuple import matplotlib.pyplot as plt import neptune import numpy as np import pandas as pd from scikitplot.metrics import plot_precision_recall, plot_roc from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score from sklearn.neural_network import MLPClassifier def split_data( ...
-
Create a model evaluation node in the
src/KEDRO_PROJECT/nodes.py
file.nodes.pydef evaluate_models( predictions: dict, test_y: pd.DataFrame, neptune_run: neptune.handler.Handler): """Node for - evaluating Random Forest and MLP models - creating ROC and Precision-Recall Curves """ ...
Tip
You can treat
neptune_run
like a normal Neptune run and log metadata to it as you normally would. You must use the special string
neptune_run
as the run handler in Kedro pipelines. -
Create the ROC and precision-recall curves as Matplotlib figures and use the
append()
method to log them to the nodes/evaluate_models/plots/plot_roc_curve
and nodes/evaluate_models/plots/plot_precision_recall_curve
namespaces, respectively.nodes.pyfrom scikitplot.metrics import plot_precision_recall, plot_roc ... def evaluate_models( predictions: dict, test_y: pd.DataFrame, neptune_run: neptune.handler.Handler ): """Node for - evaluating Random Forest and MLP models - creating ROC and Precision-Recall Curves """ for name, y_pred_proba in predictions.items(): y_true = test_y.to_numpy() y_pred_proba = np.array(y_pred_proba) y_pred = np.argmax(y_pred_proba, axis=1) y_pred = np.where( y_pred == 0, "setosa", np.where(y_pred == 1, "versicolor", "virginica"), ) fig, ax = plt.subplots() plot_roc(test_y, y_pred_proba, ax=ax, title=f"ROC curve {name}") neptune_run["nodes/evaluate_models/plots/plot_roc_curve"].append(fig) fig, ax = plt.subplots() plot_precision_recall(test_y, y_pred_proba, ax=ax, title=f"PR curve {name}") neptune_run["nodes/evaluate_models/plots/plot_precision_recall_curve"].append(fig)
nodes.py""" This is a boilerplate pipeline generated using Kedro 0.18.4 """ from typing import Any, Dict, Tuple import matplotlib.pyplot as plt import neptune import numpy as np import pandas as pd from scikitplot.metrics import plot_precision_recall, plot_roc from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier def split_data( data: pd.DataFrame, parameters: Dict[str, Any] ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: """Splits data into features and target training and test sets. Args: data: Data containing features and target. parameters: Parameters defined in parameters.yml. Returns: Split data. """ data_train = data.sample( frac=parameters["train_fraction"], random_state=parameters["random_state"] ) data_test = data.drop(data_train.index) X_train = data_train.drop(columns=parameters["target_column"]) X_test = data_test.drop(columns=parameters["target_column"]) y_train = data_train[parameters["target_column"]] y_test = data_test[parameters["target_column"]] return X_train, X_test, y_train, y_test def train_rf_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): max_depth = parameters["rf_max_depth"] n_estimators = parameters["rf_n_estimators"] max_features = parameters["rf_max_features"] clf = RandomForestClassifier( max_depth=max_depth, n_estimators=n_estimators, max_features=max_features, ) clf.fit(train_x, train_y) return clf def train_mlp_model( train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] ): """Node for training MLP model""" alpha = parameters["mlp_alpha"] max_iter = parameters["mlp_max_iter"] clf = MLPClassifier(alpha=alpha, max_iter=max_iter) clf.fit(train_x, train_y) return clf def get_predictions( rf_model: RandomForestClassifier, mlp_model: MLPClassifier, test_x: pd.DataFrame ) -> Dict[str, Any]: """Node for making predictions given a pre-trained model and a test set.""" predictions = {} for name, model in zip(["rf", "mlp"], 
[rf_model, mlp_model]): y_pred = model.predict_proba(test_x).tolist() predictions[name] = y_pred return predictions def evaluate_models( predictions: dict, test_y: pd.DataFrame, neptune_run: neptune.handler.Handler ): """Node for - evaluating Random Forest and MLP models - creating ROC and Precision-Recall Curves """ for name, y_pred_proba in predictions.items(): y_true = test_y.to_numpy() y_pred_proba = np.array(y_pred_proba) y_pred = np.argmax(y_pred_proba, axis=1) y_pred = np.where( y_pred == 0, "setosa", np.where(y_pred == 1, "versicolor", "virginica"), ) fig, ax = plt.subplots() plot_roc(test_y, y_pred_proba, ax=ax, title=f"ROC curve {name}") neptune_run["nodes/evaluate_models/plots/plot_roc_curve"].append(fig) fig, ax = plt.subplots() plot_precision_recall(test_y, y_pred_proba, ax=ax, title=f"PR curve {name}") neptune_run["nodes/evaluate_models/plots/plot_precision_recall_curve"].append(fig)
Related
-
Add the predictions dataset to the Kedro catalog in the
conf/base/catalog.yml
file:# Here you can define all your data sets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # # An example data set definition can look as follows: # #bikes: # type: pandas.CSVDataSet # filepath: "data/01_raw/bikes.csv" # #weather: # type: spark.SparkDataSet # filepath: s3a://your_bucket/data/01_raw/weather* # file_format: csv # credentials: dev_s3 # load_args: # header: True # inferSchema: True # save_args: # sep: '|' # header: True # #scooters: # type: pandas.SQLTableDataSet # credentials: scooters_credentials # table_name: scooters # load_args: # index_col: ['name'] # columns: ['name', 'gear'] # save_args: # if_exists: 'replace' # # if_exists: 'fail' # # if_exists: 'append' # # The Data Catalog supports being able to reference the same file using two different DataSet implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # This is a data set used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: type: pandas.CSVDataSet filepath: data/01_raw/iris.csv predictions: type: kedro.extras.datasets.json.JSONDataSet filepath: data/07_model_output/predictions.json predictions@neptune: type: kedro_neptune.NeptuneFileDataSet filepath: data/07_model_output/predictions.json
-
To log the Kedro DataSet as a file to Neptune, add the
predictions@neptune
dataset to the catalog in the conf/base/catalog.yml
file:# Here you can define all your data sets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # # An example data set definition can look as follows: # #bikes: # type: pandas.CSVDataSet # filepath: "data/01_raw/bikes.csv" # #weather: # type: spark.SparkDataSet # filepath: s3a://your_bucket/data/01_raw/weather* # file_format: csv # credentials: dev_s3 # load_args: # header: True # inferSchema: True # save_args: # sep: '|' # header: True # #scooters: # type: pandas.SQLTableDataSet # credentials: scooters_credentials # table_name: scooters # load_args: # index_col: ['name'] # columns: ['name', 'gear'] # save_args: # if_exists: 'replace' # # if_exists: 'fail' # # if_exists: 'append' # # The Data Catalog supports being able to reference the same file using two different DataSet implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # This is a data set used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: type: pandas.CSVDataSet filepath: data/01_raw/iris.csv predictions: type: kedro.extras.datasets.json.JSONDataSet filepath: data/07_model_output/predictions.json predictions@neptune: type: kedro_neptune.NeptuneFileDataSet filepath: data/07_model_output/predictions.json
Adding the run to the pipeline#
Next, we'll add the Neptune run handler to the Kedro pipeline.
- Go to a pipeline definition, such as
src/KEDRO_PROJECT/pipeline.py
. - Add nodes to train the RF and MLP models, get predictions, and evaluate the models. Add "neptune_run" as an input to the
evaluate_models
node.
from .nodes import (
...,
evaluate_models,
get_predictions,
train_mlp_model,
train_rf_model,
)
...
node(
func=train_rf_model,
inputs=["X_train", "y_train", "parameters"],
outputs="rf_model",
name="train_rf_model",
),
node(
func=train_mlp_model,
inputs=["X_train", "y_train", "parameters"],
outputs="mlp_model",
name="train_mlp_model",
),
node(
func=get_predictions,
inputs=["rf_model", "mlp_model", "X_test"],
outputs="predictions",
name="get_predictions",
),
node(
func=evaluate_models,
inputs=["predictions", "y_test", "neptune_run"],
outputs=None,
name="evaluate_models",
),
...
"""
This is a boilerplate pipeline
generated using Kedro 0.18.4
"""
from kedro.pipeline import Pipeline, node, pipeline
from .nodes import (
evaluate_models,
get_predictions,
split_data,
train_mlp_model,
train_rf_model,
)
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the split / train / predict / evaluate pipeline.

    The pipeline wires five nodes together through their dataset names:
    the data split feeds both training nodes, the two trained models feed
    the prediction node, and the predictions feed the evaluation node.

    Args:
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The pipeline built from the five nodes, in the order listed above.
    """
    # Split the raw dataset into train/test features and targets.
    split_node = node(
        func=split_data,
        inputs=["example_iris_data", "parameters"],
        outputs=["X_train", "X_test", "y_train", "y_test"],
        name="split",
    )

    # Both training nodes consume the same training split and parameters.
    rf_training_node = node(
        func=train_rf_model,
        inputs=["X_train", "y_train", "parameters"],
        outputs="rf_model",
        name="train_rf_model",
    )
    mlp_training_node = node(
        func=train_mlp_model,
        inputs=["X_train", "y_train", "parameters"],
        outputs="mlp_model",
        name="train_mlp_model",
    )

    # Predictions from both models are emitted as a single "predictions" output.
    prediction_node = node(
        func=get_predictions,
        inputs=["rf_model", "mlp_model", "X_test"],
        outputs="predictions",
        name="get_predictions",
    )

    # "neptune_run" is passed as an input so the node can log to the run;
    # the node itself produces no Kedro output.
    evaluation_node = node(
        func=evaluate_models,
        inputs=["predictions", "y_test", "neptune_run"],
        outputs=None,
        name="evaluate_models",
    )

    return pipeline(
        [
            split_node,
            rf_training_node,
            mlp_training_node,
            prediction_node,
            evaluation_node,
        ]
    )
-
On the command line, execute your Kedro pipeline:
kedro run
-
To open the run in Neptune, click the Neptune link that appears in the console output.
Example link: https://app.neptune.ai/o/common/org/kedro-integration/e/KED-1564/metadata
Displaying node output#
- To display your ROC curves, in All metadata, navigate to the
kedro/nodes/evaluate_models/plots/plot_roc_curve
namespace. - Click on one of the ROC curves to enlarge and scroll through the images.
- To preview your JSON file, navigate to
kedro/catalog/files
. -
To combine the above in one view, create a custom dashboard.