Version datasets in model training runs

You can version datasets, models, and other file objects as Artifacts in Neptune.

This guide shows how to:

  • Keep track of a dataset version in your model training runs with artifacts

  • Query the dataset version from previous runs to make sure you are training on the same dataset version

  • Group your Neptune Runs by the dataset version they were trained on

By the end of this guide, you will have trained a few models while making sure that the same dataset version was used, and you will see the Runs for that dataset version in the Neptune UI.

See this example in Neptune

Runs grouped by dataset version in Neptune UI

Keywords: Data versioning, Data version control, Track dataset version

Before you start

Make sure you meet the following prerequisites before starting:

To use artifacts, you need at least version 0.10.10 of the Neptune client:

pip install "neptune-client>=0.10.10"
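
If you are not sure which version is installed in your environment, one way to check is to query the package metadata (Python 3.8+):

from importlib.metadata import version

# Print the installed neptune-client version; it should be 0.10.10 or higher
print(version("neptune-client"))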

Step 1: Prepare a model training script

Create a training script 'train_model.py' where you:

  • Specify dataset paths for training and testing

  • Define model parameters

  • Calculate the score on the test set

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
TEST_DATASET_PATH = '../datasets/tables/test.csv'

PARAMS = {'n_estimators': 5,
          'max_depth': 1,
          'max_features': 2,
          }

def train_model(params, train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    TARGET_COLUMN = ['variety']
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)

    score = rf.score(X_test, y_test)
    return score

score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)

Full script (train_model.py):

import neptune.new as neptune
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
TEST_DATASET_PATH = '../datasets/tables/test.csv'

def train_model(params, train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    TARGET_COLUMN = ['variety']
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)

    score = rf.score(X_test, y_test)
    return score

#
# Run model training and log dataset versions, parameters and test score to Neptune
#

# Create a Neptune Run and start logging
run = neptune.init(project='common/data-versioning',
                   api_token='ANONYMOUS')

# Track dataset version
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
run["datasets/test"].track_files(TEST_DATASET_PATH)

# Log parameters
PARAMS = {'n_estimators': 3,
          'max_depth': 3,
          'max_features': 1,
          }
run["parameters"] = PARAMS

# Calculate and log test score
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
run["metrics/test_score"] = score

# Get the Neptune Run ID of the first, baseline model training run
baseline_run_id = run['sys/id'].fetch()
print(baseline_run_id)

# Stop logging to the active Neptune Run
run.stop()

#
# Run model training with different parameters and log metadata to Neptune
#

# Create a new Neptune Run and start logging
new_run = neptune.init(project='common/data-versioning',
                       api_token='ANONYMOUS')

# Track dataset version
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
new_run["datasets/test"].track_files(TEST_DATASET_PATH)

# Query the baseline Neptune Run
baseline_run = neptune.init(project='common/data-versioning',
                            api_token='ANONYMOUS',
                            run=baseline_run_id,
                            mode="read-only")

# Fetch the dataset version of the baseline model training run
baseline_run["datasets/train"].fetch_hash()

# Check that the dataset versions did not change between the runs
new_run.wait()  # force asynchronous logging operations to finish
assert baseline_run["datasets/train"].fetch_hash() == new_run["datasets/train"].fetch_hash()
assert baseline_run["datasets/test"].fetch_hash() == new_run["datasets/test"].fetch_hash()

# Select new parameters and log them to Neptune
PARAMS = {'n_estimators': 3,
          'max_depth': 2,
          'max_features': 3,
          }
new_run["parameters"] = PARAMS

# Calculate the test score and log it to Neptune
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
new_run["metrics/test_score"] = score

# Stop logging to the active Neptune Runs
new_run.stop()
baseline_run.stop()

#
# Go to Neptune to see how the results changed while the training dataset versions stayed the same!
#

Step 2: Add tracking of the dataset version

First, create a Neptune Run:

import neptune.new as neptune

run = neptune.init(project="common/data-versioning",
                   api_token="ANONYMOUS")

Then track the versions of your training and testing datasets:

run["datasets/train"].track_files(TRAIN_DATASET_PATH)
run["datasets/test"].track_files(TEST_DATASET_PATH)

You can also version the entire dataset folder by running:

run["datasets_tables"].track_files('../datasets/tables')

Step 3: Run model training and log parameters and metrics to Neptune

  • Log parameters to Neptune

PARAMS = {'n_estimators': 5,
          'max_depth': 1,
          'max_features': 2,
          }
run["parameters"] = PARAMS

  • Log score on the test set to Neptune:

score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
run["metrics/test_score"] = score
  • Get the Run ID of your model training from Neptune. You will need it later to assert that the baseline and new runs were trained on the same dataset versions.

baseline_run_id = run['sys/id'].fetch()
print(baseline_run_id)

You should get a Run ID like this: 'DAT-25'
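
In this example both runs are created in the same script, so the ID simply stays in the baseline_run_id variable. If your baseline and new models are trained in separate script invocations, you could persist the ID yourself; for instance (a hypothetical helper, not part of the original script):

# Hypothetical: save the baseline Run ID so that a later training run can read it back
with open("baseline_run_id.txt", "w") as f:
    f.write(baseline_run_id)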

  • Stop logging to the current Neptune Run

run.stop()
  • Run training

python train_model.py

Step 4: Add a version check for the training and testing datasets

You can fetch the dataset version hash from the baseline and compare it with the current version of the dataset.

To do that:

  • Create a new Neptune Run and track the dataset version:

new_run = neptune.init(project="common/data-versioning",
                       api_token="ANONYMOUS")

new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
new_run["datasets/test"].track_files(TEST_DATASET_PATH)

  • Get the Neptune Run object for the baseline model:

baseline_run = neptune.init(project="common/data-versioning",
                            api_token="ANONYMOUS",
                            run=baseline_run_id,
                            mode="read-only")

  • Fetch the dataset version hash of the baseline model training run:

baseline_run["datasets/train"].fetch_hash()
  • Compare the current dataset version with the baseline version:

new_run.wait()
assert baseline_run["datasets/train"].fetch_hash() == new_run["datasets/train"].fetch_hash()
assert baseline_run["datasets/test"].fetch_hash() == new_run["datasets/test"].fetch_hash()

To force all asynchronous logging operations to finish, use the run.wait() method.
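
If you prefer not to call wait() explicitly, you can also create the run in synchronous mode so that every logging call completes before returning; a minimal sketch (synchronous logging is slower, but no wait() calls are needed):

new_run = neptune.init(project="common/data-versioning",
                       api_token="ANONYMOUS",
                       mode="sync")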

Step 5: Run model training with new parameters

  • Change the parameters and run model training again

PARAMS = {'n_estimators': 10,
          'max_depth': 3,
          'max_features': 2,
          }
new_run["parameters"] = PARAMS

score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
new_run["metrics/test_score"] = score