Generate classification data#
Use this function to generate sample data sets; it wraps scikit-learn's make_classification. See the linked documentation for a description of all parameters.
# nuclio: ignore
import nuclio
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Optional, List, Any
from sklearn.datasets import make_classification
from mlrun.execution import MLClientCtx
def gen_class_data(
    context: MLClientCtx,
    n_samples: int,
    m_features: int,
    k_classes: int,
    header: Optional[List[str]],
    label_column: Optional[str] = "labels",
    weight: float = 0.5,
    random_state: int = 1,
    key: str = "classifier-data",
    file_ext: str = "parquet",
    sk_params: Optional[dict] = None,
):
    """Create a binary classification sample dataset and save.

    If no filename is given it will default to:
    "simdata-{n_samples}X{m_features}.parquet".

    Additional scikit-learn parameters can be set using **sk_params, please see
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    for more details.

    :param context:      function context
    :param n_samples:    number of rows/samples
    :param m_features:   number of cols/features
    :param k_classes:    number of classes
    :param header:       header for features array; auto-generated
                         ("feat_0"...) when falsy
    :param label_column: column name of ground-truth series
    :param weight:       fraction of sample negative value (ground-truth=0);
                         a scalar or a list of per-class proportions
    :param random_state: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:          key of data in artifact store
    :param file_ext:     (pqt) extension for parquet file
    :param sk_params:    additional parameters for
                         `sklearn.datasets.make_classification`
    """
    # Avoid a mutable default argument; treat None as "no extra params".
    sk_params = sk_params if sk_params is not None else {}

    # sklearn's `weights` must be array-like (length n_classes or
    # n_classes - 1); wrap a scalar so the float default still works.
    if isinstance(weight, (list, tuple)):
        weights = list(weight)
    else:
        weights = [weight]

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=weights,
        n_classes=k_classes,
        random_state=random_state,
        **sk_params)

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not header:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = header
    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)
# nuclio: end-code
save#
from mlrun import code_to_function
from mlrun.platforms.other import auto_mount

# Flip to True to build against the GPU image instead.
gpus = False

# Spec for converting the notebook code into an mlrun job function.
fn_params = dict(
    name="gen_class_data",
    handler="gen_class_data",
    kind="job",
    image="mlrun/ml-models-gpu" if gpus else "mlrun/ml-models",
    description="simulate classification data using scikit-learn",
    categories=["simulators", "ml"],
    labels={"author": "yjb", "framework": "sklearn"},
)

fn = code_to_function(**fn_params)
fn.export("function.yaml")
fn.apply(auto_mount())
[mlrun] 2020-06-14 10:37:07,647 function spec saved to path: function.yaml
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7faf4a975eb8>
test function#
from mlrun import NewTask, mlconf

# Parameters for the data-generation task: 10k rows, 5 features,
# balanced binary classes, written out as CSV.
task_params = {
    "name": "tasks generate classification data",
    "params": {
        "n_samples": 10_000,
        "m_features": 5,
        "k_classes": 2,
        "weight": [0.5, 0.5],
        "sk_params": {"n_informative": 2},
        "file_ext": "csv",
    },
}
local#
from mlrun import run_local

# Execute the generator in-process (no cluster round-trip).
local_task = NewTask(**task_params)
run_local(local_task, handler=gen_class_data)
[mlrun] 2020-06-14 10:33:01,963 starting run tasks generate classification data uid=1d7c5af7e4b04bd98755c87842455105 -> http://mlrun-api:8080
[mlrun] 2020-06-14 10:33:02,156 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y
project | uid | iter | start | state | name | labels | inputs | parameters | results | artifacts |
---|---|---|---|---|---|---|---|---|---|---|
default | 0 | Jun 14 10:33:01 | completed | tasks generate classification data | v3io_user=admin kind=handler owner=admin host=jupyter-7b44c8d958-kklf7 |
n_samples=10000 m_features=5 k_classes=2 weight=[0.5, 0.5] sk_params={'n_informative': 2} file_ext=csv |
classifier-data |
to track results use .show() or .logs() or in CLI:
!mlrun get run 1d7c5af7e4b04bd98755c87842455105 --project default , !mlrun logs 1d7c5af7e4b04bd98755c87842455105 --project default
[mlrun] 2020-06-14 10:33:02,198 run executed, status=completed
<mlrun.model.RunObject at 0x7fafa49fc160>
remote#
# Submit the same task to the cluster, storing artifacts under the
# globally configured artifact path.
remote_task = NewTask(**task_params)
run = fn.run(remote_task, artifact_path=mlconf.artifact_path)
[mlrun] 2020-06-14 10:33:02,619 starting run tasks generate classification data uid=8f2102b308f446f28242c03ac1a835a7 -> http://mlrun-api:8080
[mlrun] 2020-06-14 10:33:02,723 Job is running in the background, pod: tasks-generate-classification-data-wjdsf
[mlrun] 2020-06-14 10:33:08,285 starting local run: main.py # gen_class_data
[mlrun] 2020-06-14 10:33:08,806 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y
[mlrun] 2020-06-14 10:33:08,823 run executed, status=completed
final state: succeeded
project | uid | iter | start | state | name | labels | inputs | parameters | results | artifacts |
---|---|---|---|---|---|---|---|---|---|---|
default | 0 | Jun 14 10:33:08 | completed | tasks generate classification data | v3io_user=admin kind=job owner=admin host=tasks-generate-classification-data-wjdsf |
n_samples=10000 m_features=5 k_classes=2 weight=[0.5, 0.5] sk_params={'n_informative': 2} file_ext=csv |
classifier-data |
to track results use .show() or .logs() or in CLI:
!mlrun get run 8f2102b308f446f28242c03ac1a835a7 --project default , !mlrun logs 8f2102b308f446f28242c03ac1a835a7 --project default
[mlrun] 2020-06-14 10:33:11,884 run executed, status=completed