Generate classification data#

Use this function to generate sample data sets. It wraps scikit-learn's make_classification; see the linked documentation for a description of all parameters.

# nuclio: ignore
import nuclio
import pandas as pd
from typing import Optional, List
from sklearn.datasets import make_classification

from mlrun.execution import MLClientCtx

def gen_class_data(
    context: MLClientCtx,
    n_samples: int,
    m_features: int,
    k_classes: int,
    header: Optional[List[str]] = None,
    label_column: str = "labels",
    weight: Optional[List[float]] = None,
    random_state: int = 1,
    key: str = "classifier-data",
    file_ext: str = "parquet",
    sk_params: Optional[dict] = None,
):
    """Create a binary classification sample dataset and save.
    If no filename is given it will default to:
    "simdata-{n_samples}X{m_features}.parquet".
    
    Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.
    
    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0)
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      (pqt) extension for parquet file
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=weight,
        n_classes=k_classes,
        random_state=random_state,
        **(sk_params or {}))

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)
    
    context.log_dataset(key, df=data, format=file_ext, index=False)
# nuclio: end-code
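
As a standalone sanity check (independent of MLRun), the underlying scikit-learn call can be exercised directly; note that `weights` expects a per-class list of fractions, not a single float:

# quick standalone check of the underlying scikit-learn call;
# weights is a per-class list of fractions
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=5, n_classes=2,
                           weights=[0.5, 0.5], random_state=1)
print(X.shape)   # (100, 5)
print(y.mean())  # roughly 0.5, since the classes are balanced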

save#

from mlrun import code_to_function
from mlrun.platforms.other import auto_mount

gpus = False

fn_params = {
    "name"        : "gen_class_data",
    "handler"     : "gen_class_data",
    "kind"        : "job",
    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
    "description" : "simulate classification data using scikit-learn",
    "categories"  : ["simulators", "ml"],
    "labels"      : {"author": "yjb", 'framework': 'sklearn'},
}

fn = code_to_function(**fn_params)

fn.export("function.yaml")
fn.apply(auto_mount())
[mlrun] 2020-06-14 10:37:07,647 function spec saved to path: function.yaml
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7faf4a975eb8>
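
Once exported, the function spec can be re-loaded in another notebook or project. A minimal sketch, assuming function.yaml sits on the local path:

# re-load the exported function spec from the saved YAML
from mlrun import import_function

fn2 = import_function("function.yaml")
print(fn2.metadata.name)  # gen_class_data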

test function#

from mlrun import NewTask, mlconf

task_params = {
    "name":        "tasks generate classification data", 
    "params" : {
        "n_samples"   : 10_000,
        "m_features"  : 5,
        "k_classes"   : 2,
        "weight"      : [0.5, 0.5],
        "sk_params"   : {"n_informative": 2},
        "file_ext"    : "csv"}}

local#

from mlrun import run_local
run_local(NewTask(**task_params), handler=gen_class_data)
[mlrun] 2020-06-14 10:33:01,963 starting run tasks generate classification data uid=1d7c5af7e4b04bd98755c87842455105  -> http://mlrun-api:8080
[mlrun] 2020-06-14 10:33:02,156 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y
project: default | uid: 1d7c5af7e4b04bd98755c87842455105 | iter: 0 | start: Jun 14 10:33:01 | state: completed
name:       tasks generate classification data
labels:     v3io_user=admin, kind=handler, owner=admin, host=jupyter-7b44c8d958-kklf7
parameters: n_samples=10000, m_features=5, k_classes=2, weight=[0.5, 0.5], sk_params={'n_informative': 2}, file_ext=csv
artifacts:  classifier-data
to track results use .show() or .logs() or in CLI: 
!mlrun get run 1d7c5af7e4b04bd98755c87842455105 --project default , !mlrun logs 1d7c5af7e4b04bd98755c87842455105 --project default
[mlrun] 2020-06-14 10:33:02,198 run executed, status=completed
<mlrun.model.RunObject at 0x7fafa49fc160>
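
The logged dataset can be read back directly from the artifact path shown in the log above. A minimal sketch, assuming the default /User/artifacts path from this environment:

# read the logged dataset back for inspection
# (path taken from the log line above; adjust to your artifact_path)
import pandas as pd

df = pd.read_csv("/User/artifacts/classifier-data.csv")
print(df.shape)                                   # expect (10000, 6): 5 features + label column
print(df["labels"].value_counts(normalize=True))  # classes roughly balanced per weight=[0.5, 0.5]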

remote#

run = fn.run(NewTask(**task_params), artifact_path=mlconf.artifact_path)
[mlrun] 2020-06-14 10:33:02,619 starting run tasks generate classification data uid=8f2102b308f446f28242c03ac1a835a7  -> http://mlrun-api:8080
[mlrun] 2020-06-14 10:33:02,723 Job is running in the background, pod: tasks-generate-classification-data-wjdsf
[mlrun] 2020-06-14 10:33:08,285 starting local run: main.py # gen_class_data
[mlrun] 2020-06-14 10:33:08,806 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y

[mlrun] 2020-06-14 10:33:08,823 run executed, status=completed
final state: succeeded
project: default | uid: 8f2102b308f446f28242c03ac1a835a7 | iter: 0 | start: Jun 14 10:33:08 | state: completed
name:       tasks generate classification data
labels:     v3io_user=admin, kind=job, owner=admin, host=tasks-generate-classification-data-wjdsf
parameters: n_samples=10000, m_features=5, k_classes=2, weight=[0.5, 0.5], sk_params={'n_informative': 2}, file_ext=csv
artifacts:  classifier-data
to track results use .show() or .logs() or in CLI: 
!mlrun get run 8f2102b308f446f28242c03ac1a835a7 --project default , !mlrun logs 8f2102b308f446f28242c03ac1a835a7 --project default
[mlrun] 2020-06-14 10:33:11,884 run executed, status=completed
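
The returned run object tracks the logged artifacts. A hedged sketch of locating the dataset from it (accessor names can vary across MLRun versions):

# locate the dataset logged by the remote run via the run object;
# run.outputs maps artifact keys to their stored paths
print(run.outputs["classifier-data"])  # e.g. /User/artifacts/classifier-data.csv
print(run.state())                     # completed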