"""
Running an Experiment on a dataset using pipeline API
=====================================================

This notebook demonstrates how to run an experiment that searches over hyperparameters and saves the scores for each specified set of parameters, using `cartodata.pipeline.experiment.PipelineExperiment` on the `lisn` dataset.

First we define the necessary global variables.
"""

from pathlib import Path # noqa

ROOT_DIR = Path.cwd().parent

SOURCE = "authors"
NATURE = "articles"

# The directory where the artifacts of the experiment will be saved
TOP_DIR = ROOT_DIR / "experiment_pipeline_lisn"
# The directory where dataset.yaml files reside
CONF_DIR = ROOT_DIR / "conf"
# The directory where files necessary to load dataset columns reside
INPUT_DIR = ROOT_DIR / "datas"

TOP_DIR

###############################################################################
# Initialize Parameter Iterator
# -----------------------------
#
# We will initialize a parameter iterator to iterate through our parameters. There are two options: `GridIterator` and `RandomIterator`.

from cartodata.model_selection.iterator import GridIterator, RandomIterator # noqa

help(GridIterator)

""
help(RandomIterator)
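
###############################################################################
# `GridIterator` walks the full parameter grid exhaustively, while
# `RandomIterator`, as its name suggests, draws parameter sets at random
# (see the help output above for its exact arguments). As a quick
# illustration of the mechanics, here is a toy grid with a single parameter;
# the `params_dict` keyword and the `params_frame` attribute are the same
# ones we use with the full grid below.

GridIterator(params_dict={"robustseed": [0, 1]}).params_frame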

###############################################################################
# We define the set of parameters that we want to test. Parameters for a phase are specified using the phase name as key and a list of dictionaries as value, one dictionary per alternative configuration of that phase.

from cartodata.phases import PhaseProjectionND, PhaseProjection2D, PhaseClustering  # noqa

params = {
    "robustseed": [0],
    "authors__filter_min_score": [4],
    "filter_min_score": [6],
    "words__column_names": [
        ["en_keyword_s", "en_domainAllCodeLabel_fs",
         "en_abstract_s", "en_title_s"],
        ["en_abstract_s", "en_title_s",
         "en_keyword_s", "en_domainAllCodeLabel_fs"],
    ],
    PhaseProjectionND.NAME: [
        {"key": ["bert"], "family": ["all-MiniLM-L6-v2", "specter2"],
         "max_length": [256, 512]},
        {"key": ["lsa"], "num_dims": [50, 100, 200]}
    ],
    PhaseProjection2D.NAME: [
        {"key": ["umap"], "n_neighbors": [10, 20, 50],
         "min_dist": [0.1, 0.25, 0.5], "metric": ["euclidean"]},
        {"key": ["tsne"], "perplexity": [30, 50]}
    ]
}
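
###############################################################################
# Before creating the iterator we can estimate the grid size by hand. The
# helper below is a hypothetical sketch, not part of cartodata: it assumes
# `GridIterator` takes the Cartesian product over the top-level parameter
# lists and, for each phase entry, sums the products over the alternative
# dictionaries. We can compare the result with `params_frame.shape` below.

from functools import reduce  # noqa
from operator import mul  # noqa


def estimate_grid_size(params_dict):
    """Estimate the number of parameter combinations (hypothetical helper)."""
    total = 1
    for value in params_dict.values():
        if value and isinstance(value[0], dict):
            # phase entry: union of the alternative configurations
            total *= sum(
                reduce(mul, (len(v) for v in d.values()), 1) for d in value
            )
        else:
            total *= len(value)
    return total


estimate_grid_size(params)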

""
param_iterator = GridIterator(params_dict=params)

""
param_iterator.params_frame.shape

""
param_iterator.params_frame

###############################################################################
# Run Experiment
# --------------------------------------
#
#
# We will run the experiment using `cartodata.pipeline.experiment.PipelineExperiment`.

from cartodata.pipeline.experiment import PipelineExperiment  # noqa

help(PipelineExperiment)

###############################################################################
# All possible scoring classes are in the `cartodata.model_selection.scoring` module. We will run scoring for each, so we will import them all.

from cartodata.model_selection.scoring import (
    NeighborsND, Neighbors2D, Comparative, TrustworthinessSklearn, TrustworthinessUmap,
    Clustering, FinalScore
)

###############################################################################
# We specify which scores the experiment should calculate using the
# `score_list` parameter. If we do not specify it, the experiment evaluates
# scores for all available scoring classes.
#
# Here we specify the classes explicitly, listing those defined in the
# `cartodata.model_selection.scoring` module (we leave `TrustworthinessUmap`
# commented out in the call below).
#
# It is possible to pass parameters to the score classes as keyword arguments.
#
# For example, if `cartodata.model_selection.scoring.FinalScore` is specified
# in the `score_list`, the experiment calculates an aggregated score at the
# end of each run by taking the average of all scores. If we want only a
# subset of scores to be included in the average, we can specify it using
# `final_score__name_list`. In general, parameters for a scoring class are
# named in the format `scoring_KEY__scoring_parameter`.
#

experiment = PipelineExperiment(
    "lisn", "2022.11.15.1", TOP_DIR, CONF_DIR, INPUT_DIR,
    NATURE, SOURCE, param_iterator,
    score_list=[NeighborsND,
                Neighbors2D,
                Comparative,
                TrustworthinessSklearn,
#                TrustworthinessUmap,
                Clustering,
                FinalScore],
    final_score__name_list=[
        PhaseProjectionND.prefix("neighbors_articles_authors"), 
        PhaseProjection2D.prefix("neighbors_articles_authors"), 
        PhaseClustering.prefix("clu_score")
    ],   
    neighbors__recompute=True, 
    neighbors__min_score=30, 
    trustworthiness_sklearn__n_neighbors=10
)

###############################################################################
# Now we will run the experiment for 3 different sets of parameters.

results = experiment.run(3)

###############################################################################
# When the experiment is run, the results of all runs are saved in `experiment.results`. We can access the values corresponding to each run with `experiment.results.runs_`.

experiment.results.runs_[0].scores

""
list(experiment.results.runs_[0].desc_scores.keys())

""
experiment.results.runs_[0].desc_scores

""
list(experiment.results.runs_[0].raw_scores.keys())

""
experiment.results.runs_[0].raw_scores

""
experiment.results.print_best(n=20)

###############################################################################
# Let's examine some of the results of the experiment on the file system.
#
# We will first check the contents of the `scores` directory.

# !ls $TOP_DIR/scores

###############################################################################
# The `6_pst__final_score.csv` file contains the final score for each set of parameters together with the parameter values. We had 3 runs, so there are 3 rows.

# !cat $TOP_DIR/scores/6_pst__final_score.csv

###############################################################################
# The `final_results.csv` file displays each score calculated during the experiment in a separate column, together with a `rank` and an aggregated score `agscore`. The `agscore` value is the same as the value in the `6_pst__final_score.csv` file.

# !cat $TOP_DIR/scores/final_results.csv
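
###############################################################################
# Since these are plain CSV files, we can also load them back for further
# analysis, for example with pandas (a minimal sketch, assuming the file has
# a standard header row with the columns described above):

import pandas as pd  # noqa

df_results = pd.read_csv(TOP_DIR / "scores" / "final_results.csv")
df_results.sort_values("rank").head()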

###############################################################################
# Each of the other files in the directory contains a single score calculated for all runs. For example, the `2_nD__neighbors_articles_authors.csv` file contains the `2_nD__neighbors_articles_authors` score for the 3 runs.

# !cat $TOP_DIR/scores/2_nD__neighbors_articles_authors.csv

###############################################################################
# The files with `det` in their names contain the neighbors and their scores that were used to calculate the `2_nD__neighbors_articles_authors` score for each run.

# !cat $TOP_DIR/scores/2_nD__neighbors_articles_authors_det.csv

###############################################################################
# These files also reside in the hierarchical dataset directories generated
# during the run, where each file contains the scores of one specific run
# together with its hyperparameters and scoring parameters.
#
# For example, see the
# `experiment_pipeline_lisn/lisn/2022.11.15.1/0/mat_articles__authors_4_teams_4_labs_4_words_10_0.05_None_None_5_4/bert_768_True_all-MiniLM-L6-v2_256/scores_2_nD__neighbors_articles_authors_det.csv` file.

# !cat $TOP_DIR/lisn/2022.11.15.1/0/mat_articles__authors_4_teams_4_labs_4_words_10_0.05_None_None_5_4/bert_768_True_all-MiniLM-L6-v2_256/scores_2_nD__neighbors_articles_authors_det.csv

###############################################################################
# For each set of parameters, the experiment generates a directory structure of the form:
#
# `top_dir / dataset / dataset_version / robustseed / dataset_column_parameters / projection_nd_key_dim / projection2D_key_n_neighbors_min_dist_metric_init_learning_rate_repulsion_strength / clustering_key_base_factor`.
#
# Each score calculated at a certain level in the directory structure is saved in that directory.
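
###############################################################################
# Knowing this layout, we can collect the score files generated inside the
# run directories with a recursive glob; the `scores_*.csv` pattern matches
# the per-run file names we saw above.

sorted(TOP_DIR.rglob("scores_*.csv"))[:10]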

###############################################################################
# Now we will continue the experiment and run it for 3 more sets of parameters.

experiment.run(3)

###############################################################################
# We can see that we have now run the first 6 parameter sets from the parameters dataframe.

len(experiment.results.runs_)
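
###############################################################################
# As before, we can rank the runs completed so far:

experiment.results.print_best(n=6)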