Skip to content

Training the model

Let us define the training function.

%%writefile "src/train-model.py"
import os
import pandas as pd
import numpy as np
from pickle import dump
import sklearn.metrics
from digitalhub_runtime_python import handler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

@handler(outputs=["model"])
def train_model(project, di):
    """
    Train an SVM classifier on the breast cancer dataset and log metrics.

    Args:
        project: Project handle; used here only to register the trained
            model through ``project.log_model``.
        di: Input data item. It must expose ``as_df()`` returning a
            DataFrame that contains a ``target`` column (the label); every
            other column is used as a feature.

    Returns:
        The model entity returned by ``project.log_model``, after the
        evaluation metrics have been attached with ``log_metrics``.
    """
    df_cancer = di.as_df()
    X = df_cancer.drop(["target"], axis=1)
    y = df_cancer["target"]
    # Hold out 20% of the rows for evaluation; the fixed seed keeps the
    # split (and therefore the reported metrics) reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
    svc_model = SVC()
    svc_model.fit(X_train, y_train)
    y_predict = svc_model.predict(X_test)

    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs("model", exist_ok=True)

    with open("model/breast_cancer_classifier.pkl", "wb") as f:
        dump(svc_model, f, protocol=5)

    # Metrics are computed on the held-out split only.
    # NOTE(review): the scorers are called with their default averaging,
    # which presumably assumes a binary target -- confirm for other datasets.
    metrics = {
        "f1_score": sklearn.metrics.f1_score(y_test, y_predict),
        "accuracy": sklearn.metrics.accuracy_score(y_test, y_predict),
        "precision": sklearn.metrics.precision_score(y_test, y_predict),
        "recall": sklearn.metrics.recall_score(y_test, y_predict),
    }
    # The whole ./model/ directory (containing the pickle written above) is
    # passed as the model source.
    model = project.log_model(name="breast_cancer_classifier", kind="sklearn", source="./model/")
    model.log_metrics(metrics)
    return model

The function takes the analysis dataset as input, trains an SVC model with scikit-learn on an 80/20 train/test split, saves it to disk, and logs the model together with its F1, accuracy, precision and recall metrics.

Let us register it:

# Register the training code as a project function of kind "python".
# `code_src` points at the file written by the %%writefile cell and
# `handler` names the entry-point function defined inside it.
train_fn = project.new_function(
    name="train-classifier",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train-model.py",
    handler="train_model",
    # NOTE(review): only numpy is pinned here; pandas and scikit-learn are
    # presumably provided by the runtime image -- confirm.
    requirements=["numpy<2"],
)

and run it:

# Take the "dataset" output of the earlier gen_data_run and pass its key to
# the training function; the "di" input name matches the handler's parameter.
dataset = gen_data_run.output("dataset")
train_inputs = {"di": dataset.key}
train_run = train_fn.run(
    action="job",
    inputs=train_inputs,
    wait=True,
)

As a result, a new model is registered in the Core and may be used by different inference operations:

# Fetch the entity that the training run exposed under its "model" output.
model = train_run.output("model")
# Bare expression: in a notebook cell this presumably displays the path
# stored in the model spec.
model.spec.path

Lastly, we'll deploy and test the model.