Version datasets in model training runs
How to track versions of datasets, models, and other artifacts in Neptune
You can version datasets, models, and other file objects as artifacts in Neptune.
This guide shows how to:
  • Keep track of a dataset version in your model training runs with artifacts
  • Query the dataset version from previous runs to make sure you are training on the same dataset version
  • Group your Neptune Runs by the dataset version they were trained on
We'll train a few models, verify that the same dataset version was used for each, and then see the runs grouped by this dataset version in the Neptune app.
Runs grouped by dataset version in Neptune UI

Before you start

Make sure you have the following installed: neptune-client, pandas, and scikit-learn (the training script imports all three).
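If you install from PyPI, something like the following should cover the prerequisites (package names inferred from the imports used in this guide):
pip install neptune-client pandas scikit-learn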

Step 1: Prepare a model training script

Create a training script train_model.py where you:
  • Specify dataset paths for training and testing
  • Define model parameters
  • Calculate the score on the test set
A minimal version of the script looks like this:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
TEST_DATASET_PATH = '../datasets/tables/test.csv'
PARAMS = {'n_estimators': 5,
          'max_depth': 1,
          'max_features': 2,
          }


def train_model(params, train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    TARGET_COLUMN = ['variety']
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)

    score = rf.score(X_test, y_test)
    return score


score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
For reference, the complete train_model.py, including the Neptune logging you will add in the next steps, looks like this:
train_model.py
import neptune.new as neptune
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
TEST_DATASET_PATH = '../datasets/tables/test.csv'


def train_model(params, train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    TARGET_COLUMN = ['variety']
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)

    score = rf.score(X_test, y_test)
    return score


#
# Run model training and log dataset versions, parameters, and test score to Neptune
#

# Create a Neptune run and start logging
run = neptune.init(project='common/data-versioning',
                   api_token='ANONYMOUS')

# Track dataset versions
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
run["datasets/test"].track_files(TEST_DATASET_PATH)

# Log parameters
PARAMS = {'n_estimators': 3,
          'max_depth': 3,
          'max_features': 1,
          }
run["parameters"] = PARAMS

# Calculate and log the test score
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
run["metrics/test_score"] = score

# Get the Neptune run ID of the first, baseline model training run
baseline_run_id = run['sys/id'].fetch()
print(baseline_run_id)

# Stop logging to the active Neptune run
run.stop()

#
# Run model training with different parameters and log metadata to Neptune
#

# Create a new Neptune run and start logging
new_run = neptune.init(project='common/data-versioning',
                       api_token='ANONYMOUS')

# Track dataset versions
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
new_run["datasets/test"].track_files(TEST_DATASET_PATH)

# Query the baseline Neptune run in read-only mode
baseline_run = neptune.init(project='common/data-versioning',
                            api_token='ANONYMOUS',
                            run=baseline_run_id,
                            mode="read-only")

# Fetch the dataset version of the baseline model training run
baseline_run["datasets/train"].fetch_hash()

# Check whether the dataset versions changed between the runs
new_run.wait()  # force asynchronous logging operations to finish
assert baseline_run["datasets/train"].fetch_hash() == new_run["datasets/train"].fetch_hash()
assert baseline_run["datasets/test"].fetch_hash() == new_run["datasets/test"].fetch_hash()

# Select new parameters and log them to Neptune
PARAMS = {'n_estimators': 3,
          'max_depth': 2,
          'max_features': 3,
          }
new_run["parameters"] = PARAMS

# Calculate the test score and log it to Neptune
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
new_run["metrics/test_score"] = score

# Stop logging to the active Neptune runs
new_run.stop()
baseline_run.stop()

#
# Go to Neptune to see how the results changed, knowing that the training dataset versions were the same!
#

Step 2: Add tracking of the dataset version

  • Create a Neptune run:
import neptune.new as neptune

run = neptune.init(project="common/data-versioning",
                   api_token="ANONYMOUS")
  • Save dataset versions as Neptune artifacts with the track_files() method:
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
run["datasets/test"].track_files(TEST_DATASET_PATH)
You can also log dataset versions at the project level, which makes them easier to organize and collaborate on:
project = neptune.init_project(name="common/data-versioning",
                               api_token="ANONYMOUS")
project["datasets/train"].track_files(TRAIN_DATASET_PATH)
For more information, see Organize and share dataset versions.
You can also version the entire dataset folder:
run["datasets_tables"].track_files('../datasets/tables')

Step 3: Run model training and log parameters and metrics to Neptune

  • Log parameters to Neptune:
PARAMS = {'n_estimators': 5,
          'max_depth': 1,
          'max_features': 2,
          }
run["parameters"] = PARAMS
  • Log the score on the test set to Neptune:
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
run["metrics/test_score"] = score
  • Get the Neptune run ID of this baseline training run. You will use it later to check that the baseline and new runs trained on the same dataset versions:
baseline_run_id = run['sys/id'].fetch()
print(baseline_run_id)
You should get a run ID like DAT-25.
  • Stop logging to the current Neptune run:
run.stop()
  • Run training:
python train_model.py

Step 4: Add a version check for the training and testing datasets

You can fetch the dataset version hash from the baseline run and compare it with the hash of the dataset the current run is using (see the sketch after the snippet below).
To do that:
  • Create a new Neptune run and track the dataset version:
new_run = neptune.init(project="common/data-versioning",
                       api_token="ANONYMOUS")

new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
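With both runs tracking their datasets, the next step is to reopen the baseline run in read-only mode and compare its artifact hashes with the new run's. A minimal sketch, following the full train_model.py shown in Step 1:
# Reopen the baseline run in read-only mode
baseline_run = neptune.init(project="common/data-versioning",
                            api_token="ANONYMOUS",
                            run=baseline_run_id,
                            mode="read-only")

# Make sure the new run's asynchronous tracking calls have finished
new_run.wait()

# Fail fast if the dataset versions differ between the runs
assert baseline_run["datasets/train"].fetch_hash() == new_run["datasets/train"].fetch_hash()
assert baseline_run["datasets/test"].fetch_hash() == new_run["datasets/test"].fetch_hash()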