<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">"""
Aligned pipeline
=============================

In this example we will demonstrate the use of `AlignedUMAP` with the pipeline API.

"""

###############################################################################
# Create Aligned Dataset
# ------------------------------------------

###############################################################################
# To use aligned UMAP, we need `cartodata.pipeline.datasets.CSVSliceDataset`. `CSVSliceDataset` divides the specified dataset into `slice_count` slices.
#
# Setting the `slice_type` parameter to `cumulative` makes the slices cumulative; here we use `discrete` slices instead.
#
# Before slicing, we can sort the dataset using the `sort_asc` parameter, specifying the column name to sort by in ascending order.
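#
# As a rough standalone illustration, assuming `discrete` slices are disjoint
# and `cumulative` slices each include all previous ones, the two slicing modes
# would split a small sorted list as follows (this is plain Python, not
# cartodata code):

# Hypothetical illustration of the assumed slicing semantics.
toy_years = [2000, 2005, 2010, 2015, 2020, 2022]
half = len(toy_years) // 2

toy_discrete = [toy_years[:half], toy_years[half:]]   # disjoint slices
toy_cumulative = [toy_years[:half], toy_years[:]]     # each slice extends the previous one

print(toy_discrete)    # [[2000, 2005, 2010], [2015, 2020, 2022]]
print(toy_cumulative)  # [[2000, 2005, 2010], [2000, 2005, 2010, 2015, 2020, 2022]]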

from cartodata.pipeline.datasets import CSVSliceDataset  # noqa
from pathlib import Path # noqa

ROOT_DIR = Path.cwd().parent
# The directory where files necessary to load dataset columns reside
INPUT_DIR = ROOT_DIR / "datas"
# The directory where the generated dump files will be saved
TOP_DIR = ROOT_DIR / "dumps"

slice_count = 2

dataset = CSVSliceDataset(
    "lisn", input_dir=INPUT_DIR, version="3.0.0", filename="lisn_2000_2022.csv",
    fileurl="https://zenodo.org/record/7323538/files/lisn_2000_2022.csv",
    columns=None, slice_count=slice_count, slice_type="discrete", 
    sort_asc="producedDateY_i", index_col=0
)

###############################################################################
# Now we should define our entities and set the column names corresponding to those entities from the data file. We have 4 entities:
#
# | entity | column name in the file |
# |--------|-------------------------|
# | articles | en_title_s |
# | authors | authFullName_s |
# | labs | structAcronym_s |
# | words | en_abstract_s, en_title_s, en_keyword_s, en_domainAllCodeLabel_fs |
#
#
# Cartolabe provides 4 types of columns: 
#
#
# - **IdentityColumn**: The entity of this column represents the main entity of the dataset. The column data corresponding to the entity in the file should contain a single value and this value should be unique among column values. There can only be one `IdentityColumn` in the dataset.
# - **CSColumn**: The entity of this column type is related to the main entity, and can contain single or comma separated values.
# - **CorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file. It uses a modified version of CountVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer).
# - **TfidfCorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file or can contain filepath from which to read the text corpus. It uses TfidfVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).
#
#
# In this dataset, **Articles** is our main entity. We will define it as an IdentityColumn:

from cartodata.pipeline.columns import IdentityColumn, CSColumn, CorpusColumn  # noqa

articles_column = IdentityColumn(nature="articles", column_name="en_title_s")
authors_column = CSColumn(
    nature="authors", column_name="authFullName_s", filter_min_score=4
)
labs_column = CSColumn(
    nature="labs", column_name="structAcronym_s", filter_min_score=4
)
words_column = CorpusColumn(
    nature="words",
    column_names=["en_abstract_s", "en_title_s",
                  "en_keyword_s", "en_domainAllCodeLabel_fs"],
    stopwords="stopwords.txt", nb_grams=4, min_df=10, max_df=0.05,
    min_word_length=5, normalize=True
)

dataset.set_columns(
    [articles_column, authors_column, labs_column, words_column])
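
###############################################################################
# For intuition, the `words` column's parameters are roughly analogous to
# scikit-learn CountVectorizer settings. The sketch below is a standalone
# illustration on toy documents with toy thresholds; it is not the exact
# vectorizer that cartodata builds internally.

from sklearn.feature_extraction.text import CountVectorizer  # noqa

toy_docs = [
    "graph neural networks for physics simulations",
    "neural networks and deep learning methods",
    "statistical physics of learning dynamics",
]
# ngram_range=(1, 4) mirrors nb_grams=4; min_df/max_df are toy values here,
# whereas the pipeline above uses min_df=10 and max_df=0.05 on the real corpus.
toy_vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1, max_df=1.0,
                                 stop_words="english")
toy_counts = toy_vectorizer.fit_transform(toy_docs)
print(toy_counts.shape)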

###############################################################################
# As we are going to use AlignedUMAP, we need to create an instance of `cartodata.pipeline.common.AlignedPipeline`.

from cartodata.pipeline.common import AlignedPipeline  # noqa

pipeline = AlignedPipeline(dataset, top_dir=TOP_DIR, input_dir=INPUT_DIR)

###############################################################################
# Creating correspondence matrices for each entity type
# -------------------------------------------------------------------------------
#
# We want to extract matrices that will map the correspondence between the articles and the entities we want to use.
#
# AlignedPipeline has a `generate_entity_matrices` function that generates matrices and scores for each entity (nature) specified for the dataset.

matrices_all, scores_all = pipeline.generate_entity_matrices()

###############################################################################
# We can list the sizes of each entity matrix for each slice.

for i in range(dataset.slice_count):
    print(f"############## Slice {i + 1} #################")
    matrices_i = matrices_all[i]

    for nature, matrix in zip(pipeline.natures, matrices_i):
        print(f"{nature}  -------------   {matrix.shape}")

###############################################################################
# Dimension reduction
# ------------------------------
#
# One way to see the matrices that we created is as coordinates in the space of
# all articles. What we want to do is to reduce the dimension of this space to
# make it easier to work with and see.
#
# **LSA projection**
#
# We'll start by using the LSA (Latent Semantic Analysis) technique to reduce the number of rows in our data.
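#
# LSA is typically implemented as a truncated SVD of the sparse entity
# matrices. As a rough standalone sketch (not the pipeline's internal code),
# keeping 100 latent dimensions with scikit-learn looks like this:

from scipy.sparse import random as sparse_random  # noqa
from sklearn.decomposition import TruncatedSVD  # noqa

# Toy sparse matrix standing in for one of the entity matrices.
X_toy = sparse_random(1000, 5000, density=0.01, random_state=42)
svd_toy = TruncatedSVD(n_components=100, random_state=42)
X_toy_100d = svd_toy.fit_transform(X_toy)
print(X_toy_100d.shape)  # (1000, 100)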

from cartodata.pipeline.projectionnd import LSAProjection  # noqa

num_dim = 100

lsa_projection = LSAProjection(num_dim)
pipeline.set_projection_nd(lsa_projection)

###############################################################################
# Now we can run LSA projection on the matrices.

matrices_nD_all = pipeline.do_projection_nD()

""
for i in range(dataset.slice_count):
    print(f"############## Slice {i + 1} #################")
    matrices_nD = matrices_nD_all[i]

    for nature, matrix in zip(pipeline.natures, matrices_nD):
        print(f"{nature}  -------------   {matrix.shape}")


###############################################################################
# This makes it easier to work with them for clustering or nearest neighbors
# tasks, but we also want to project them on a 2D space to be able to map them.
#
# **Aligned UMAP projection**
#
# The `UMAP <https://github.com/lmcinnes/umap>`_ (Uniform Manifold Approximation
# and Projection) is a dimension reduction technique that can be used for
# visualisation similarly to t-SNE.
#
# We use this algorithm to project the matrices of each slice into 2 dimensions, aligned with each other across slices.
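#
# For intuition, the underlying umap-learn API roughly looks as follows. This
# is a standalone sketch on toy data, not the pipeline's code: consecutive
# slices are tied together through a `relations` dict that maps row indices of
# one slice to row indices of the next.

import numpy as np  # noqa
import umap  # noqa

rng = np.random.default_rng(42)
slice_a = rng.normal(size=(100, 20))
# The second toy slice extends the first one with 50 new rows.
slice_b = np.vstack([slice_a, rng.normal(size=(50, 20))])
relations = [{i: i for i in range(100)}]  # rows shared between the two slices

toy_mapper = umap.AlignedUMAP(n_neighbors=20, min_dist=0.1)
toy_mapper.fit([slice_a, slice_b], relations=relations)
print([e.shape for e in toy_mapper.embeddings_])  # [(100, 2), (150, 2)]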

from cartodata.pipeline.projection2d import AlignedUMAPProjection  # noqa

n_neighbors = 20
min_dists = 0.1


projection_2d = AlignedUMAPProjection(
    n_neighbors=n_neighbors,
    min_dist=min_dists,
    init='random',
    random_state=42,
    n_epochs=200)

pipeline.set_projection_2d(projection_2d)

###############################################################################
# Now we can run AlignedUMAP projection on the LSA matrices.

matrices_2D_all, scores_final = pipeline.do_projection_2D()

""
for i in range(len(matrices_2D_all)):
    print(f"############## Slice {i + 1} #################")
    for j in range(len(matrices_2D_all[i])):
        print(f"{matrices_2D_all[i][j].shape}")

###############################################################################
# Now that we have 2D coordinates for our points, we can try to plot them to
# get a feel of the data's shape.

labels = tuple(pipeline.natures)
colors = ['b', 'r', 'c', 'y', 'm']

for i in range(len(matrices_2D_all)):
    matrices_2D = matrices_2D_all[i]

    pipeline.plot_map(matrices_2D, labels, colors,
                      title=f"Slice {i+1}")

###############################################################################
# The plots above, as we don't have labels for the points, don't tell us much by
# themselves. But we can see that the data forms clusters which we could try to identify.
#
# Clustering
# ---------------
#
# In order to identify clusters, we use the KMeans clustering technique on the
# articles. We'll also try to label these clusters by selecting the most
# frequent words that appear in each cluster's articles.
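#
# As a rough standalone sketch of the idea (not cartodata's implementation):
# cluster the 2D article positions with KMeans, then label each cluster with
# the words that occur most often in its articles.

import numpy as np  # noqa
from sklearn.cluster import KMeans  # noqa

rng = np.random.default_rng(0)
toy_positions = rng.normal(size=(200, 2))              # toy 2D article coordinates
toy_word_counts = rng.integers(0, 3, size=(200, 50))   # toy articles x words counts
toy_vocab = np.array([f"word_{k}" for k in range(50)])

toy_km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(toy_positions)
for c in range(8):
    counts_c = toy_word_counts[toy_km.labels_ == c].sum(axis=0)
    print(c, toy_vocab[np.argsort(counts_c)[::-1][:3]])  # 3 most frequent words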

from cartodata.pipeline.clustering import KMeansClustering  # noqa

# level of clusters, hl: high level, ml: medium level
cluster_natures = ["hl_clusters", "ml_clusters"]

kmeans_clustering = KMeansClustering(
    n=8, base_factor=3, natures=cluster_natures)

pipeline.set_clustering(kmeans_clustering)
(clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all, 
 clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()

###############################################################################
# We will now display medium level clusters:

for i in range(len(matrices_2D_all)):

    ml_index = 1
    clus_scores_ml = clus_scores_all[i][ml_index]
    clus_mat_ml = clus_2D_all[i][ml_index]

    fig_title = (
        f"{pipeline.dataset.name} {pipeline.clustering.natures[ml_index]} "
        f"{pipeline.projection_nd.key} slice {i + 1}/{pipeline.dataset.slice_count}"
    )
    matrices_2D = matrices_2D_all[i]

    pipeline.plot_map(matrices_2D, labels, colors,
                      title=fig_title,
                      annotations=clus_scores_ml.index,
                      annotation_mat=clus_mat_ml)

###############################################################################
# Now we will save the plots in the working directory.

pipeline.save_plots()

###############################################################################
# We store the image name so we can view the image later:

image_title_parts = pipeline.title_parts_clus("ml_clusters")
image_title_parts.append("s2")

image_name_ml_clusters_3_0_0 = "_".join(image_title_parts) + ".png"
image_name_ml_clusters_3_0_0

###############################################################################
# Nearest neighbors
# ----------------------------
#
# One more thing which could be useful to appreciate the quality of our data
# would be to get each point's nearest neighbors. If our data processing is
# done correctly, we expect the related articles, labs, words and authors to be
# located close to each other.
#
# Finding nearest neighbors is a common task with various algorithms aiming to
# solve it. The `find_neighbors` method uses one of these algorithms to find the
# nearest points of all entities (articles, authors,
# labs, words). It takes an optional weight parameter to tweak
# the distance calculation to select points that have a higher score but are
# maybe a bit farther instead of just selecting the closest neighbors.
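#
# As a rough standalone sketch of the idea (not cartodata's exact
# implementation): query more candidates than needed, then re-rank them so that
# higher-scoring points can beat slightly closer ones. The re-ranking formula
# below is purely illustrative.

import numpy as np  # noqa
from sklearn.neighbors import NearestNeighbors  # noqa

rng = np.random.default_rng(0)
toy_points = rng.normal(size=(500, 2))    # toy 2D coordinates
toy_scores = rng.random(500) + 0.1        # toy point scores

toy_nn = NearestNeighbors(n_neighbors=30).fit(toy_points)
dist, idx = toy_nn.kneighbors(toy_points[:1])   # 30 candidates for the first point

weight = 0.5  # hypothetical weight favouring high-score points
ranking = np.argsort(dist[0] / (toy_scores[idx[0]] ** weight))
print(idx[0][ranking][:10])                     # 10 re-ranked neighbours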

from cartodata.pipeline.neighbors import AllNeighbors   # noqa

n_neighbors = 10
weights = [0, 0.5, 0.5, 0, 0]

neighboring = AllNeighbors(n_neighbors=n_neighbors, power_scores=weights)

pipeline.set_neighboring(neighboring)
pipeline.find_neighbors()

###############################################################################
# Export file using exporter
# ---------------------------------
#
# We can now export the data.
#
# The exported data will be the points extracted from the dataset corresponding to the entities that we have defined.
#
# In the export file, we will have the following columns for each point:
#
#
# | column | value |
# |--------|-------------|
# | nature | one of articles, authors, labs, words |
# | label | point's label |
# | score | point's score |
# | rank |  point's rank |
# | x | point's x location on the map |
# | y | point's y location on the map |
# | nn_articles | neighboring articles to this point |
# | nn_labs | neighboring labs to this point |
# | nn_words | neighboring words to this point |
#
# We will call the `pipeline.export` function. It will create an `export.feather` file and save it under `pipeline.working_dir`.

from cartodata.pipeline.exporting import (
    ExportNature, MetadataColumn
) # noqa

ex_author = ExportNature(key="authors", 
                         refs=["labs"])
meta_year_article = MetadataColumn(column="producedDateY_i", as_column="year", 
                                   func="x.astype(str)")

meta_url_article = MetadataColumn(column="halId_s", as_column="url", func="x.fillna('')")

ex_article = ExportNature(key="articles", refs=["labs", "authors"], 
                         add_metadata=[meta_year_article, meta_url_article])

""
pipeline.export(export_natures=[ex_article, ex_author])

###############################################################################
# Let's see the directory structure for the pipeline's working_dir:

for file in pipeline.working_dir.iterdir():
    print(file)

###############################################################################
# Here we have directories s1 and s2 that contain the files for each slice.
# The **reducer.pkl** file is generated by serializing the AlignedUMAP object. The scores files contain the scores for the last (i.e. 2nd) slice. These are saved to be reused if a new version of the dataset is to be aligned with this model.
#
# Now we will view the contents of "s2":

for file in (pipeline.working_dir / "s2").iterdir():
    print(file)

###############################################################################
# Let's display the contents of the `export.feather` file under the **s2** directory, which corresponds to the 2nd slice of the dataset.

import pandas as pd # noqa

df = pd.read_feather(pipeline.working_dir / "s2" / "export.feather")
df.head()

###############################################################################
# We will store the matrices and clusters to use later in an animation showing the evolution of the map.

clus_scores_plt = clus_scores_all
clus_2D_plt = clus_2D_all
matrices_2D_plt = matrices_2D_all

###############################################################################
# Align new version of dataset
# --------------------------------------------
#
# We have sliced version 3.0.0 of the LISN dataset into 2 slices and created 2D projections aligned with each other. It contains data up to and including year 2022.
#
#
# Now we will download a more recent version of the LISN data that also contains publications from year 2023, and we will align it with version 3.0.0 of the LISN dataset.

from cartodata.scraping import scrape_hal, process_domain_column  # noqa
from cartodata.command_line import STRUCT_MAP # noqa

filters = {}
struct = "lisn"

yearfrom = 2000
yearto = 2024

file = dataset.input_dir / f"{struct.lower()}_{yearfrom}_{yearto - 1}.csv"

if not file.exists():
    filters['structId_i'] = "(" + STRUCT_MAP[struct] + ")"

    years = range(yearfrom, yearto)

    df = scrape_hal(struct, filters, years, cool_down=2)
    process_domain_column(df)

    df.to_csv(file)

###############################################################################
# At this point we have two options to align the new data with version 3.0.0:
#
# - We can keep the current `projection_2d` instance and update it with the new data.
# - We can create a new AlignedUMAP instance and align it by specifying the previous version.
#
# **Using the current AlignedUMAP**
#
# We will create a new dataset for the new data file and version it as 3.1.0.

slice_count = 1

new_dataset = CSVSliceDataset(
    "lisn", input_dir=INPUT_DIR, filename="lisn_2000_2023.csv", 
    slice_count=slice_count, version="3.1.0", sort_asc="producedDateY_i"
)

new_dataset.set_columns(
    [articles_column, authors_column, labs_column, words_column])

###############################################################################
# We will update pipeline's dataset with the new one.

pipeline.update_dataset(new_dataset)

###############################################################################
# We will generate entity matrices for the new dataset.

matrices_all, scores_all = pipeline.generate_entity_matrices()

###############################################################################
# We will do LSA projection.

from cartodata.pipeline.projectionnd import LSAProjection  # noqa

num_dim = 100

lsa_projection = LSAProjection(num_dim)

pipeline.set_projection_nd(lsa_projection)

matrices_nD_all = pipeline.do_projection_nD()

""
matrices_nD = matrices_nD_all[0]

for nature, matrix in zip(pipeline.natures, matrices_nD):
    print(f"{nature}  -------------   {matrix.shape}")

###############################################################################
# Now we will run aligned UMAP to align version 3.1.0 of the dataset using the current reducer.

matrices_2D_all, scores_final = pipeline.do_projection_2D(prev_version="3.0.0")

""
for i in range(len(matrices_2D_all[0])):
    print(f"{matrices_2D_all[0][i].shape}")

""
labels = tuple(pipeline.natures)

matrices_2D = matrices_2D_all[0]

pipeline.plot_map(matrices_2D, labels, colors, title="New Dataset")

""
(clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all,
 clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()

""
ml_index = 1
clus_scores_ml = clus_scores_all[0][ml_index]
clus_mat_ml = clus_2D_all[0][ml_index]

fig_title = (
    f"{pipeline.dataset.name} {pipeline.clustering.natures[ml_index]} "
    f"{pipeline.projection_nd.key} New Dataset"
)
matrices_2D = matrices_2D_all[0]

pipeline.plot_map(matrices_2D, labels, colors, title=fig_title,
                  annotations=clus_scores_ml.index,
                  annotation_mat=clus_mat_ml)

""
pipeline.save_plots()

""
clus_scores_plt += clus_scores_all
clus_2D_plt += clus_2D_all
matrices_2D_plt += matrices_2D_all

###############################################################################
# Let's view dataset version 3.1.0 alongside the final slice of dataset version 3.0.0.

import matplotlib.pyplot as plt  # noqa

image_title_parts = pipeline.title_parts_clus("ml_clusters")
image_title_parts.append("s1")

image_name_ml_clusters_3_1_0 = "_".join(image_title_parts) + ".png"

img1 = plt.imread(pipeline.working_dir.parent / "3.0.0" / "s2" / image_name_ml_clusters_3_0_0)
img2 = plt.imread(pipeline.working_dir / "s1" / image_name_ml_clusters_3_1_0)

f, ax = plt.subplots(2, 1, figsize=(9, 12))

ax[0].imshow(img1)
ax[1].imshow(img2)

ax[0].axis('off')
ax[1].axis('off')

plt.tight_layout()
plt.show()

###############################################################################
# **Using a new AlignedUMAP instance**
#
# Now we will follow the alternative flow: create a new instance of AlignedUMAPProjection and load the reducer from version 3.0.0 by reading the `reducer.pkl` file.
#
# First we will create a new dataset, versioned 3.2.0, with the same data.

new_dataset = CSVSliceDataset(
    "lisn", input_dir=INPUT_DIR, filename="lisn_2000_2023.csv", 
    slice_count=slice_count, version="3.2.0", sort_asc="producedDateY_i"
)

new_dataset.set_columns(
    [articles_column, authors_column, labs_column, words_column])

pipeline.update_dataset(new_dataset)

""
n_neighbors = 20
min_dists = 0.1

projection_2d = AlignedUMAPProjection(
    n_neighbors=n_neighbors,
    min_dist=min_dists,
    init='random',
    random_state=42,
    n_epochs=200)

pipeline.set_projection_2d(projection_2d)

""
matrices_all, scores_all = pipeline.generate_entity_matrices()

""
from cartodata.pipeline.projectionnd import LSAProjection  # noqa

num_dim = 100

lsa_projection = LSAProjection(num_dim)

pipeline.set_projection_nd(lsa_projection)

matrices_nD_all = pipeline.do_projection_nD()

""
matrices_2D_all, scores_final = pipeline.do_projection_2D(prev_version="3.0.0")

""
for i in range(len(matrices_2D_all[0])):
    print(f"{matrices_2D_all[0][i].shape}")

""
labels = tuple(pipeline.natures)

matrices_2D = matrices_2D_all[0]

pipeline.plot_map(matrices_2D, labels, colors, title="New Dataset")

""
(clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all,
 clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()

""
ml_index = 1
clus_scores_ml = clus_scores_all[0][ml_index]
clus_mat_ml = clus_2D_all[0][ml_index]

fig_title = (
    f"{pipeline.dataset.name} {pipeline.clustering.natures[ml_index]} "
    f"{pipeline.projection_nd.key} New Dataset"
)
matrices_2D = matrices_2D_all[0]

pipeline.plot_map(matrices_2D, labels, colors, title=fig_title,
                  annotations=clus_scores_ml.index,
                  annotation_mat=clus_mat_ml)

""
pipeline.save_plots()

""
import matplotlib.pyplot as plt  # noqa

image_title_parts = pipeline.title_parts_clus("ml_clusters")
image_title_parts.append("s1")

image_name_ml_clusters_3_2_0 = "_".join(image_title_parts) + ".png"

img1 = plt.imread(pipeline.working_dir.parent / "3.0.0" / "s2" / image_name_ml_clusters_3_0_0)
img2 = plt.imread(pipeline.working_dir.parent / "3.1.0" / "s1" / image_name_ml_clusters_3_1_0)
img3 = plt.imread(pipeline.working_dir / "s1" / image_name_ml_clusters_3_2_0)

f, ax = plt.subplots(3, 1, figsize=(20, 15))

ax[0].imshow(img1)
ax[1].imshow(img2)
ax[2].imshow(img3)

ax[0].axis('off')
ax[1].axis('off')
ax[2].axis('off')

plt.tight_layout()
plt.show()

""
clus_scores_plt += clus_scores_all
clus_2D_plt += clus_2D_all
matrices_2D_plt += matrices_2D_all

""
from matplotlib import animation  # noqa
import matplotlib.patheffects as pe  # noqa

fig = plt.figure(figsize=[10, 8], frameon=False, dpi=90)
ax = plt.gca()

def plot_map(matrices, labels, title=None,
             annotations=None, annotation_mat=None, annotation_color='black'):

    ax.clear()
    axes = []

    for i, m in enumerate(matrices):
        axes.append(ax.scatter(m[0, :], m[1, :],
                               color=colors[i],
                               label=labels[i]))

    # set title
    if title is not None:
        ax.set_title(title)

    # set legend
    ax.legend(tuple(axes), labels, fancybox=True, shadow=True)

    if annotations is not None and annotation_mat is not None:
        for i in range(len(annotations)):
            ax.annotate(annotations[i],
                        (annotation_mat[0, i], annotation_mat[1, i]),
                        color=annotation_color,
                        path_effects=[
                            pe.withStroke(linewidth=4, foreground="white")
                        ])

    return axes

def animate(i):
    ml_index = 1
    clus_scores_ml = clus_scores_plt[i][ml_index]
    clus_mat_ml = clus_2D_plt[i][ml_index]

    fig_title = (
        f"{pipeline.dataset.name} {pipeline.clustering.natures[ml_index]} "
        f"{pipeline.projection_nd.key} map {i + 1}"
    )
    matrices_2D = matrices_2D_plt[i]

    return plot_map(matrices_2D, labels, title=fig_title,
                      annotations=clus_scores_ml.index,
                      annotation_mat=clus_mat_ml)

anim = animation.FuncAnimation(fig, animate, repeat=True,
                               frames=range(0, len(matrices_2D_plt)),
                               interval=3000, blit=True, repeat_delay=3000)

plt.show()

""
anim.save("anim.gif", writer="imagemagick",fps=1, bitrate=100, dpi=80)

""
from IPython.display import Image
Image(filename='anim.gif')