Organize and share dataset versions
You can log and query metadata at the project level, including dataset and model versions, text notes, images, notebook files, and anything else you can log to a single Run.
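For instance, project-level fields accept the same assignment and file-upload operations as run fields. A minimal sketch, assuming the neptune.new API used throughout this guide (the field names such as general/brief and the file path are illustrative, not required by Neptune):

import neptune.new as neptune

# Connect to the project (same credentials as in the steps below)
project = neptune.init_project(name="common/data-versioning", api_token="ANONYMOUS")

# A text note logged at the project level
project["general/brief"] = "Project-wide notes live next to dataset versions"

# A file, e.g. a notebook or an image (path is illustrative)
project["general/data_analysis"].upload("data_analysis.ipynb")

project.stop()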
This guide shows how to:
  • Log versions of all the datasets used in a project
  • Organize dataset version metadata in the Neptune UI
  • Share all the currently used dataset versions with your team
  • Assert that you are training on the latest dataset version available
By the end of this guide, you will have logged several dataset versions, organized them in the Neptune UI, and seen how to share them with your team through a persistent link.
[Image: Organize dataset versions in the Neptune UI]
Keywords: Organize dataset versions, Data versioning, Data version control, Track dataset version

Before you start

Make sure you meet the following prerequisites before starting:
To use project metadata you need version 0.14 or newer of the Neptune client:

pip install "neptune-client>=0.14"

Step 1: Log various dataset versions to Neptune

  • Connect to your Neptune project:

import neptune.new as neptune

project = neptune.init_project(name="common/data-versioning",
                               api_token="ANONYMOUS")
Here is the full train_model.py that this guide builds up, shown once for reference:

train_model.py

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import neptune.new as neptune

# Initialize the Neptune project
project = neptune.init_project(name="common/data-versioning", api_token="ANONYMOUS")

# Create a few versions of a dataset and track them in Neptune
train = pd.read_csv('../datasets/tables/train.csv')

for i in range(5):
    train_sample = train.sample(frac=0.5 + 0.1 * i)
    train_sample.to_csv('../datasets/tables/train_sampled.csv', index=False)
    project[f'datasets/train_sampled/v{i}'].track_files(
        '../datasets/tables/train_sampled.csv', wait=True)

print(project.get_structure())

# Find the latest version of the dataset and save it as 'latest'
def get_latest_version():
    artifact_names = project.get_structure()['datasets']['train_sampled'].keys()
    versions = [int(name.replace('v', '')) for name in artifact_names if name != 'latest']
    return max(versions)

latest_version = get_latest_version()
print('latest version', latest_version)

project['datasets/train_sampled/latest'].assign(
    project[f'datasets/train_sampled/v{latest_version}'].fetch(), wait=True)

print(project.get_structure()['datasets'])

# Create a Neptune run
run = neptune.init(project='common/data-versioning', api_token='ANONYMOUS')

# Track the training dataset and assert that it is the latest version
TRAIN_DATASET_PATH = '../datasets/tables/train_sampled.csv'
run["datasets/train"].track_files(TRAIN_DATASET_PATH, wait=True)

assert run["datasets/train"].fetch_hash() == project['datasets/train_sampled/latest'].fetch_hash()

TEST_DATASET_PATH = '../datasets/tables/test.csv'

# Log parameters
PARAMS = {
    'n_estimators': 8,
    'max_depth': 3,
    'max_features': 2,
}
run["parameters"] = PARAMS

# Train the model
train = pd.read_csv(TRAIN_DATASET_PATH)
test = pd.read_csv(TEST_DATASET_PATH)

FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
TARGET_COLUMN = ['variety']
X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

rf = RandomForestClassifier(**PARAMS)
rf.fit(X_train, y_train)

# Log the test score
score = rf.score(X_test, y_test)
run["metrics/test_score"] = score

# Go to the Neptune UI to see the datasets logged at the project level!
  • Create a few versions of the dataset and track them in Neptune:

import pandas as pd

train = pd.read_csv('../datasets/tables/train.csv')

for i in range(5):
    train_sample = train.sample(frac=0.5 + 0.1 * i)
    train_sample.to_csv('../datasets/tables/train_sampled.csv', index=False)
    project[f'datasets/train_sampled/v{i}'].track_files(
        '../datasets/tables/train_sampled.csv', wait=True)
Use wait=True to make tracking synchronous: the call returns only after Neptune has processed the logging operation, so the artifact and its hash are available immediately afterwards.
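If you prefer not to block on every call, you can omit wait=True and synchronize once at the end. A sketch, assuming a sync() method is available on the project object in your client version:

# Track several files without blocking on each call...
for i in range(5):
    project[f'datasets/train_sampled/v{i}'].track_files(
        '../datasets/tables/train_sampled.csv')

# ...then wait once for all pending operations to reach Neptune
project.sync()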
  • Save the latest dataset version as a new artifact named 'latest':

def get_latest_version():
    artifact_names = project.get_structure()['datasets']['train_sampled'].keys()
    versions = [int(name.replace('v', '')) for name in artifact_names if name != 'latest']
    return max(versions)

latest_version = get_latest_version()
project['datasets/train_sampled/latest'].assign(
    project[f'datasets/train_sampled/v{latest_version}'].fetch(), wait=True)

Step 2: Access dataset versions via API or in the Neptune UI

  • Get all logged dataset versions programmatically:

project.get_structure()['datasets']
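get_structure() returns a nested dictionary that mirrors the field hierarchy, with attribute objects at the leaves. For the script above, the datasets subtree looks roughly like this (the shape is illustrative, not the exact repr):

# Roughly:
# {'train_sampled': {'v0': <Artifact>, ..., 'v4': <Artifact>, 'latest': <Artifact>}}
versions = project.get_structure()['datasets']['train_sampled']
print(sorted(versions.keys()))  # ['latest', 'v0', 'v1', 'v2', 'v3', 'v4']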
  • Go to the Neptune UI and open Project metadata > datasets to see all available dataset versions

[Image: Dataset versions in the Neptune UI]

  • Share all the available dataset versions with your team through a persistent link to this view

Step 3: Assert that you are training on the latest dataset

  • Create a Neptune run:

run = neptune.init(project='common/data-versioning',
                   api_token='ANONYMOUS')
  • Track the version of the dataset you will train on:

TRAIN_DATASET_PATH = '../datasets/tables/train_sampled.csv'
run["datasets/train"].track_files(TRAIN_DATASET_PATH, wait=True)
  • Assert that the dataset you are training on is the latest version available:

assert run["datasets/train"].fetch_hash() == project['datasets/train_sampled/latest'].fetch_hash()
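A bare assert gives little context when it fails. A small variant with an explicit error message, using the same fetch_hash() calls:

run_hash = run["datasets/train"].fetch_hash()
latest_hash = project['datasets/train_sampled/latest'].fetch_hash()

if run_hash != latest_hash:
    raise RuntimeError(
        f"Training dataset is stale: run hash {run_hash} "
        f"does not match the latest project hash {latest_hash}. "
        "Fetch the latest version before training."
    )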
You can also download the latest version of the dataset by running:
project['datasets/train_sampled/latest'].download()
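download() fetches the files tracked by the artifact, by default into the current working directory. A sketch of loading the downloaded dataset, assuming a destination argument is supported by your client version:

import pandas as pd

# Download the tracked file(s); the destination path is an assumption for illustration
project['datasets/train_sampled/latest'].download(destination='data/')

train = pd.read_csv('data/train_sampled.csv')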
  • Run model training and log parameters and metrics to Neptune:

PARAMS = {
    'n_estimators': 8,
    'max_depth': 3,
    'max_features': 2,
}
run["parameters"] = PARAMS

# Training boilerplate
rf = RandomForestClassifier(**PARAMS)
rf.fit(X_train, y_train)

score = rf.score(X_test, y_test)
run["metrics/test_score"] = score
  • Stop logging to Neptune:

run.stop()
project.stop()
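If you would rather not call stop() manually, runs can also be used as context managers in recent client versions, which stops them automatically on exit. A sketch, assuming context-manager support in your client version:

import neptune.new as neptune

with neptune.init(project='common/data-versioning', api_token='ANONYMOUS') as run:
    run["parameters"] = {'n_estimators': 8}
    # ... train and log metrics ...
# the run is stopped automatically when the block exits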

Summary

In this guide you learned how to:
  • Log versions of all the datasets used in a project
  • Organize dataset version metadata in the Neptune UI
  • Share the currently used dataset versions with your team through a persistent link
  • Assert that you are training on the latest dataset version available