Skip to content

Working with artifacts: Sharing dataset versions on project-level#

Open in Colab

You can log and query metadata at the project level, including dataset and model versions, text notes, images, notebook files, and anything else you can log to a single run.

In this tutorial, we'll track artifacts as project metadata. The flow includes:

  • Logging versions of all the datasets used in a project.
  • Organizing dataset version metadata in the Neptune app.
  • Sharing all the currently used dataset versions with your team.
  • Asserting that you're training on the latest dataset version available.

See results in Neptune  Code examples 

Before you start#

Tip

If you already took the dataset versioning tutorial, or just want to check out the artifact logging without training any models, you can skip to the Track several dataset versions in project metadata section and run the snippets from there.

Prepare a model training script#

To start, create a training script train.py:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import neptune.new as neptune

Track several dataset versions in project metadata#

  1. To log project metadata through the API, initialize the project as a Neptune object.

    You can log metadata to it just as you would to a run.

    # Initialize the project object; metadata logged here is shared project-wide,
    # not tied to a single run.
    project = neptune.init_project(
        name="workspace-name/project-name",  # (1)
    )
    
    1. The full project name. For example, "ml-team/classification". To copy it, navigate to the project settings → Properties.
  2. Save a few dataset versions as Neptune artifacts to the project.

    # Load the source data once; each iteration samples a growing fraction of it.
    train = pd.read_csv("train.csv")
    
    for i in range(5):
        # Sample 50%, 60%, ..., 90% of the rows for versions v0..v4.
        train_sample = train.sample(frac=0.5 + 0.1 * i)
        train_sample.to_csv("train_sampled.csv", index=None)
        # track_files() records the file's hash as artifact datasets/train_sampled/v{i}.
        project[f"datasets/train_sampled/v{i}"].track_files(
            "train_sampled.csv", wait=True  # (1)
        )
    
    print(project.get_structure())
    
    1. Use wait=True to ensure that all logging operations are finished. See also: API reference → wait()
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    
    import neptune.new as neptune
    
    # Connect to the shared example project using anonymous access.
    project = neptune.init_project(
        name="common/data-versioning",
        api_token=neptune.ANONYMOUS_API_TOKEN,
    )
    
    # Register five progressively larger samples of the dataset as
    # versioned artifacts (v0 .. v4) at the project level.
    train = pd.read_csv("train.csv")
    
    for i in range(5):
        sample_fraction = 0.5 + 0.1 * i
        subset = train.sample(frac=sample_fraction)
        subset.to_csv("train_sampled.csv", index=None)
        project[f"datasets/train_sampled/v{i}"].track_files(
            "train_sampled.csv", wait=True
        )
    
    print(project.get_structure())
    
  3. Save the latest dataset version as a new artifact called "latest":

    def get_latest_version(target_project=None):
        """Return the highest numeric dataset version under datasets/train_sampled.
    
        Args:
            target_project: Neptune project object to inspect. Defaults to the
                module-level ``project`` for backward compatibility, so existing
                zero-argument calls keep working.
    
        Returns:
            int: the largest N among artifact keys named "vN" (the "latest"
            alias is skipped).
    
        Raises:
            ValueError: if no numeric "vN" versions exist (clearer than the
            bare ``max()`` error the original raised on an empty sequence).
        """
        if target_project is None:
            # Fall back to the global project object, matching the original behavior.
            target_project = project
        version_keys = target_project.get_structure()["datasets"]["train_sampled"].keys()
        versions = [
            int(key.replace("v", ""))
            for key in version_keys
            if key != "latest"
        ]
        if not versions:
            raise ValueError(
                "No numeric dataset versions found under datasets/train_sampled"
            )
        return max(versions)
    
    # Determine the highest existing version number and report it.
    latest_version = get_latest_version()
    print("latest version", latest_version)
    
    # Point the "latest" alias artifact at that version's tracked files.
    project["datasets/train_sampled/latest"].assign(
        project[f"datasets/train_sampled/v{latest_version}"].fetch(), wait=True
    )
    
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    
    import neptune.new as neptune
    
    # Connect to the shared example project anonymously.
    project = neptune.init_project(
        name="common/data-versioning",
        api_token=neptune.ANONYMOUS_API_TOKEN,
    )
    
    # Register five progressively larger samples of the dataset as artifacts.
    train = pd.read_csv("train.csv")
    
    for i in range(5):
        sample_fraction = 0.5 + 0.1 * i
        subset = train.sample(frac=sample_fraction)
        subset.to_csv("train_sampled.csv", index=None)
        project[f"datasets/train_sampled/v{i}"].track_files(
            "train_sampled.csv", wait=True
        )
    
    print(project.get_structure())
    
    
    def get_latest_version():
        """Return the highest numeric version logged under datasets/train_sampled."""
        version_keys = project.get_structure()["datasets"]["train_sampled"].keys()
        numeric_versions = [
            int(key.replace("v", "")) for key in version_keys if key != "latest"
        ]
        return max(numeric_versions)
    
    
    latest_version = get_latest_version()
    print("latest version", latest_version)
    
    # Point the "latest" alias at the newest version's artifact hash.
    project["datasets/train_sampled/latest"].assign(
        project[f"datasets/train_sampled/v{latest_version}"].fetch(), wait=True
    )
    

Access dataset versions via API#

You can now list the available dataset versions with the get_structure() method:

# List every artifact stored under the project's "datasets" namespace.
print(project.get_structure()["datasets"])
Sample output
{'train_sampled': {'latest': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF367D0DC0>, 'v0': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF367A7700>, 'v1': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF36806DA0>, 'v2': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF36806E00>, 'v3': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF36806E60>, 'v4': <neptune.new.attributes.atoms.artifact.Artifact object at 0x000001BF36806EC0>}}
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import neptune.new as neptune

# Connect to the shared example project anonymously.
project = neptune.init_project(
    name="common/data-versioning",
    api_token=neptune.ANONYMOUS_API_TOKEN,
)

# Save five progressively larger samples of the dataset as project artifacts.
train = pd.read_csv("train.csv")

for i in range(5):
    sample_fraction = 0.5 + 0.1 * i
    subset = train.sample(frac=sample_fraction)
    subset.to_csv("train_sampled.csv", index=None)
    project[f"datasets/train_sampled/v{i}"].track_files(
        "train_sampled.csv", wait=True
    )

print(project.get_structure())


def get_latest_version():
    """Return the highest numeric version logged under datasets/train_sampled."""
    version_keys = project.get_structure()["datasets"]["train_sampled"].keys()
    numeric_versions = [
        int(key.replace("v", "")) for key in version_keys if key != "latest"
    ]
    return max(numeric_versions)


latest_version = get_latest_version()
print("latest version", latest_version)

# Point the "latest" alias at the newest version's artifact hash.
project["datasets/train_sampled/latest"].assign(
    project[f"datasets/train_sampled/v{latest_version}"].fetch(), wait=True
)

# Show everything tracked under the "datasets" namespace.
print(project.get_structure()["datasets"])

View dataset versions in app#

To view the dataset versions in the Neptune app:

  1. Select the Project metadata tab.
  2. Click the datasets namespace, then the train_sampled namespace.
  3. Select each artifact in the list to preview the metadata on the right.
  4. Click Share to copy and share a persistent link to this view.

See example in Neptune 

Going further: Assert that you're training on the latest dataset#

In this last part, we'll show an example of how you can interact with the tracked artifacts.

We'll fetch the dataset version marked as "latest" and assert that we're using that same version to train our model.

  1. Create a Neptune run:

    # Create a run to hold this training session's metadata.
    run = neptune.init_run()  # (1)
    
    1. If you haven't set up your credentials, you can log anonymously: neptune.init_run(api_token=neptune.ANONYMOUS_API_TOKEN, project="common/data-versioning")
  2. Log the current dataset as an artifact:

    # Track the training file on the run; wait=True blocks until the hash is synced.
    TRAIN_DATASET_PATH = "train_sampled.csv"
    run["datasets/train"].track_files(TRAIN_DATASET_PATH, wait=True)
    
  3. Assert that the current dataset is the latest version:

    # Compare artifact hashes: the run's dataset must match the project's "latest".
    assert (
        run["datasets/train"].fetch_hash()
        == project["datasets/train_sampled/latest"].fetch_hash()
    )
    

    Tip

    If you want to download the latest version of the dataset, you can use download() on the artifact field:

    # Download the files behind the "latest" artifact to the local disk.
    project["datasets/train_sampled/latest"].download()
    
  4. Train the model and log the metadata to Neptune:

    TEST_DATASET_PATH = "test.csv"
    
    # Hyperparameters, logged to the run before training.
    PARAMS = {
        "n_estimators": 8,
        "max_depth": 3,
        "max_features": 2,
    }
    run["parameters"] = PARAMS
    
    # Load the train/test splits from disk.
    train = pd.read_csv(TRAIN_DATASET_PATH)
    test = pd.read_csv(TEST_DATASET_PATH)
    
    FEATURE_COLUMNS = [
        "sepal.length",
        "sepal.width",
        "petal.length",
        "petal.width",
    ]
    TARGET_COLUMN = ["variety"]
    
    # Split each frame into features (X) and target (y).
    X_train = train[FEATURE_COLUMNS]
    y_train = train[TARGET_COLUMN]
    X_test = test[FEATURE_COLUMNS]
    y_test = test[TARGET_COLUMN]
    
    model = RandomForestClassifier(**PARAMS)
    model.fit(X_train, y_train)
    
    # Log the held-out accuracy to the run.
    score = model.score(X_test, y_test)
    run["metrics/test_score"] = score
    
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    
    import neptune.new as neptune
    
    # Initialize Neptune project
    project = neptune.init_project(
        name="common/data-versioning",
        api_token=neptune.ANONYMOUS_API_TOKEN,
    )
    
    # Create a few versions of a dataset and track them as Neptune artifacts
    train = pd.read_csv("train.csv")
    
    for i in range(5):
        train_sample = train.sample(frac=0.5 + 0.1 * i)
        train_sample.to_csv("train_sampled.csv", index=None)
        project[f"datasets/train_sampled/v{i}"].track_files(
            "train_sampled.csv", wait=True
        )
    
    print(project.get_structure())
    
    
    def get_latest_version():
        """Return the highest numeric version logged under datasets/train_sampled."""
        artifact_name = project.get_structure()["datasets"]["train_sampled"].keys()
        versions = [
            int(version.replace("v", ""))
            for version in artifact_name
            if version != "latest"
        ]
        latest_version = max(versions)
        return latest_version
    
    
    latest_version = get_latest_version()
    print("latest version", latest_version)
    
    # Point the "latest" alias at the newest version's artifact hash.
    project["datasets/train_sampled/latest"].assign(
        project[f"datasets/train_sampled/v{latest_version}"].fetch(), wait=True
    )
    
    print(project.get_structure()["datasets"])
    
    # Create a Neptune run.
    # FIX: init_run() takes the project path via the `project` argument;
    # `name` only sets the run's display name, so the original call never
    # targeted the common/data-versioning project (see the tutorial's own
    # anonymous-login note, which uses `project=`).
    run = neptune.init_run(
        project="common/data-versioning",
        api_token=neptune.ANONYMOUS_API_TOKEN,
    )
    
    # Assert that you're training on the latest dataset version
    TRAIN_DATASET_PATH = "train_sampled.csv"
    run["datasets/train"].track_files(TRAIN_DATASET_PATH, wait=True)
    
    assert (
        run["datasets/train"].fetch_hash()
        == project["datasets/train_sampled/latest"].fetch_hash()
    )
    
    TEST_DATASET_PATH = "test.csv"
    
    # Log parameters
    PARAMS = {
        "n_estimators": 8,
        "max_depth": 3,
        "max_features": 2,
    }
    run["parameters"] = PARAMS
    
    # Train the model
    train = pd.read_csv(TRAIN_DATASET_PATH)
    test = pd.read_csv(TEST_DATASET_PATH)
    
    FEATURE_COLUMNS = [
        "sepal.length",
        "sepal.width",
        "petal.length",
        "petal.width",
    ]
    TARGET_COLUMN = ["variety"]
    X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
    X_test, y_test = test[FEATURE_COLUMNS], test[TARGET_COLUMN]
    
    rf = RandomForestClassifier(**PARAMS)
    rf.fit(X_train, y_train)
    
    # Save the score
    score = rf.score(X_test, y_test)
    run["metrics/test_score"] = score
    
    #
    # Go to the Neptune app to see datasets logged at the project level!
    #
    
  5. Stop the active Neptune objects:

    # Flush any pending operations and close both connections.
    run.stop()
    project.stop()
    

To view the run in Neptune, click the link in the console output.

Back to top