Compare datasets between runs
You can version datasets, models, and other file objects as Artifacts in Neptune.
This guide shows how to:
  • Keep track of the dataset version with Neptune artifacts
  • See if models were trained on the same dataset version
  • Compare datasets in the Neptune UI to see what changed
By the end of this guide, you will train a few models on different dataset versions and compare those versions in the Neptune UI.
Compare dataset versions in the Neptune UI
Keywords: Compare dataset versions, Data versioning, Data version control, Track dataset version

Before you start

Make sure you meet the following prerequisites before starting:
To use artifacts you need at least version 0.10.10 of the Neptune client.
1
pip install "neptune-client>=0.10.10"
Copied!

Step 1: Prepare a model training script

Create a training script 'train_model.py' where you:
  • Specify dataset paths for training and testing
  • Define model parameters
  • Calculate the score on the test set
snippet
full script
1
import pandas as pd
2
from sklearn.ensemble import RandomForestClassifier
3
4
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
5
TEST_DATASET_PATH = '../datasets/tables/test.csv'
6
PARAMS = {'n_estimators': 5,
7
'max_depth':1,
8
'max_features':2,
9
}
10
11
def train_model(params, train_path, test_path):
12
train = pd.read_csv(train_path)
13
test = pd.read_csv(test_path)
14
15
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
16
TARGET_COLUMN = ['variety']
17
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
18
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
19
20
rf = RandomForestClassifier(**params)
21
rf.fit(X_train, y_train)
22
23
score = rf.score(X_test, y_test)
24
return score
25
26
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!

Step 2: Add tracking of the dataset version

snippet
full script
1
import neptune.new as neptune
2
3
run = neptune.init(project='common/data-versioning',
4
api_token='ANONYMOUS')
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
snippet
full script
1
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
2
run["datasets/test"].track_files(TEST_DATASET_PATH)
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
It is recommended to log dataset versions at the project level as well.
1
project = neptune.init_project(name="common/data-versioning",
2
api_token="ANONYMOUS")
3
project["datasets/train"].track_files(TRAIN_DATASET_PATH)
Copied!
For more information, read Organize and share dataset versions.
You can also version the entire dataset folder by running:
1
run["dataset_tables"].track_files('../datasets/tables')
Copied!

Step 3: Run model training and log parameters and metrics to Neptune

  • Log parameters to Neptune
snippet
full script
1
PARAMS = {'n_estimators': 5,
2
'max_depth':1,
3
'max_features':2,
4
}
5
run["parameters"] = PARAMS
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
  • Log score on the test set to Neptune:
snippet
full script
1
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
2
run["metrics/test_score"] = score
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
  • Stop logging to the current Neptune Run
snippet
full script
1
run.stop()
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
  • Run training
1
python train_model.py
Copied!

Step 4: Change training dataset

Change the file path to the training dataset:
snippet
full script
1
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!

Step 5: Run model training on a new training dataset

  • Create a new Neptune Run:
snippet
full script
1
new_run = neptune.init(project="common/data-versioning",
2
api_token="ANONYMOUS")
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Calculate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
  • Log new dataset versions
snippet
full script
1
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
2
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42
43
# Log parameters
44
run["parameters"] = PARAMS
45
46
# Calculate and log test score
47
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
48
run["metrics/test_score"] = score
49
50
# Stop logging to the active Neptune Run
51
run.stop()
52
53
#
54
# Change the training data
55
# Run model training and log dataset version, parameter and test score to Neptune
56
#
57
58
TRAIN_DATASET_PATH = '../datasets/tables/train_v2.csv'
59
60
# Create a new Neptune Run and start logging
61
new_run = neptune.init(project='common/data-versioning',
62
api_token='ANONYMOUS')
63
64
# Log dataset versions
65
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
66
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
67
68
# Log parameters
69
new_run["parameters"] = PARAMS
70
71
# Caclulate and log test score
72
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
73
new_run["metrics/test_score"] = score
74
75
# Stop logging to the active Neptune Run
76
new_run.stop()
77
78
#
79
# Go to Neptune to see how the datasets changed between training runs!
80
#
Copied!
  • Log parameters and test score
snippet
full script
1
new_run["parameters"] = PARAMS
2
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
3
new_run["metrics/test_score"] = score
Copied!
train_model.py
1
import neptune.new as neptune
2
import pandas as pd
3
from sklearn.ensemble import RandomForestClassifier
4
5
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
6
TEST_DATASET_PATH = '../datasets/tables/test.csv'
7
8
PARAMS = {'n_estimators': 7,
9
'max_depth': 2,
10
'max_features': 2,
11
}
12
13
14
def train_model(params, train_path, test_path):
15
train = pd.read_csv(train_path)
16
test = pd.read_csv(test_path)
17
18
FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length',
19
'petal.width']
20
TARGET_COLUMN = ['variety']
21
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
22
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
23
24
rf = RandomForestClassifier(**params)
25
rf.fit(X_train, y_train)
26
27
score = rf.score(X_test, y_test)
28
return score
29
30
31
#
32
# Run model training and log dataset version, parameter and test score to Neptune
33
#
34
35
# Create Neptune Run and start logging
36
run = neptune.init(project='common/data-versioning',
37
api_token='ANONYMOUS')
38
39
# Track dataset version
40
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
41
run["datasets/test"].track_files(TEST_DATASET_PATH)
42