import neptune.new as neptune
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

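# Paths to the train/test CSV files (the Iris dataset) whose versions will be tracked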
TRAIN_DATASET_PATH = '../datasets/tables/train.csv'
TEST_DATASET_PATH = '../datasets/tables/test.csv'

def train_model(params, train_path, test_path):
    # Load the training and test datasets
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    FEATURE_COLUMNS = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    TARGET_COLUMN = ['variety']
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]

    # Fit the model and return its accuracy on the test set
    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    return score

# Run model training and log dataset versions, parameters, and the test score to Neptune

# Create a Neptune Run and start logging
# (the 'ANONYMOUS' API token works for Neptune's public 'common/data-versioning' example project)
run = neptune.init(project='common/data-versioning', api_token='ANONYMOUS')
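
# Track the dataset files so Neptune records their versions (hashes) as artifacts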
run["datasets/train"].track_files(TRAIN_DATASET_PATH)
run["datasets/test"].track_files(TEST_DATASET_PATH)
PARAMS = {'n_estimators': 3}
run["parameters"] = PARAMS

# Calculate and log the test score
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
run["metrics/test_score"] = score
# Get Neptune Run ID of the first, baseline model training run
baseline_run_id = run['sys/id'].fetch()
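# Optionally print the ID so the baseline run is easy to find later in the Neptune UI
print(f'Baseline run ID: {baseline_run_id}')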

# Stop logging to the active Neptune Run
run.stop()

# Run model training with different parameters and log metadata to Neptune
# Create a new Neptune Run and start logging
new_run = neptune.init(project='common/data-versioning', api_token='ANONYMOUS')
new_run["datasets/train"].track_files(TRAIN_DATASET_PATH)
new_run["datasets/test"].track_files(TEST_DATASET_PATH)
# Query the baseline Neptune run
baseline_run = neptune.init(project='common/data-versioning', api_token='ANONYMOUS',
                            run=baseline_run_id, mode='read-only')
# Fetch the dataset version of the baseline model training run
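# (fetch_hash() returns the hash Neptune computed for the tracked artifact files)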
baseline_run["datasets/train"].fetch_hash()
# Check if dataset versions changed or not between the runs
new_run.wait() # force asynchronous logging operations to finish
assert baseline_run["datasets/train"].fetch_hash() == new_run[
"datasets/train"].fetch_hash()
assert baseline_run["datasets/test"].fetch_hash() == new_run[
"datasets/test"].fetch_hash()

# Select new parameters and log them to Neptune
# (the 'max_depth' entry below is illustrative; any values that differ from the baseline run will do)
PARAMS = {'n_estimators': 3,
          'max_depth': 2}
new_run["parameters"] = PARAMS
# Calculate the test score and log it to Neptune
score = train_model(PARAMS, TRAIN_DATASET_PATH, TEST_DATASET_PATH)
new_run["metrics/test_score"] = score

# Stop logging to the active Neptune Runs
new_run.stop()
baseline_run.stop()

# Go to the Neptune app to compare the two runs and see how the results changed
# while the training dataset versions stayed the same!