"""
Extracting and processing VisPubData data for Cartolabe
========================================================

In this example we will:

- extract entities (authors, articles, labs, words) from a collection of
  scientific articles
- project those entities in 2 dimensions
- cluster them
- find their nearest neighbors.

"""

###############################################################################
# Download data
# =============
#
# We will first download the CSV file that contains all the scientific
# articles published at the `VIS conference <https://ieeevis.org/>`_ between
# 1990 and 2019. The CSV is maintained on the `VisPubData
# <http://vispubdata.org>`_ web site.

from download import download

csv_url = "https://zenodo.org/record/7569091/files/vispubdata_1990_2019.csv"

download(csv_url, "../datas/vispubdata_1990_2019.csv", kind='file',
         progressbar=True, replace=False)

""
import pandas as pd   # noqa

df = pd.read_csv('../datas/vispubdata_1990_2019.csv')
df.dropna(subset=['Abstract'], inplace=True)

df.head()

###############################################################################
# The dataset contains 3028 articles.

print(df.shape[0])

###############################################################################
# Let's list the columns:

print(*df.columns, sep="\n")

###############################################################################
# Creating correspondence matrices for each entity type
# ======================================================
#
#
# From this table of articles, we want to extract matrices that will map the
# correspondence between these articles and the entities we want to use.
#
# Authors
# ------------
#
# Let's start with the authors for example. We want to create a matrix where
# the rows represent the articles and the columns represent the authors. Each
# cell (n, m) will have a 1 in it if the *nth* article was written by the *mth*
# author.

from cartodata.loading import load_comma_separated_column   # noqa

authors_mat, authors_scores = load_comma_separated_column(df,
                                                          'AuthorNames-Deduped',
                                                          comma=';')

###############################################################################
# The `load_comma_separated_column` function takes in a dataframe and the name
# of a column and returns two objects:
#
# - a sparse matrix
# - a pandas `Series`
#
# Each column of the sparse matrix `authors_mat` corresponds to an author and
# each row corresponds to an article. We see that there are 5405 distinct
# authors for 3028 articles.

authors_mat.shape

###############################################################################
# The series, which we named `authors_scores`, contains the list of authors
# extracted from the column `AuthorNames-Deduped`, each with a score equal to
# the number of rows (articles) that author is mapped to in the `authors_mat`
# matrix.

authors_scores.head()

###############################################################################
# If we look at the *2nd* column of the matrix, which corresponds to the author
# **Lambertus Hesselink**, we can see that it has 9 non-zero rows, each row
# indicating an article he authored.

print(authors_mat[:, 1])
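
###############################################################################
# As a quick, illustrative sanity check, the number of non-zero entries in a
# column should match that author's score in `authors_scores`, since each cell
# is 1 when the author wrote the article:

print(authors_mat[:, 1].sum(), authors_scores.iloc[1])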

###############################################################################
# Filtering low score entities
# --------------------------------------
#
# Many of the authors that we just extracted from the dataframe have a very
# low score, which means they are only linked to one or two articles. To
# improve the quality of our data, we'll filter out the authors that are
# linked to fewer than 2 articles.
#
# To do this, we'll use the `filter_min_score` function.

from cartodata.operations import filter_min_score   # noqa

authors_before = len(authors_scores)
authors_mat, authors_scores = filter_min_score(authors_mat, authors_scores, 2)

print(f"Removed {authors_before - len(authors_scores)} authors with less than 2 "
      f"articles from a total of {authors_before} authors.")
print(f"Working with {len(authors_scores)} authors.\n")

###############################################################################
# Words
# ------------
#
# For the words, it's a bit trickier because we want to extract n-grams (groups
# of n terms) instead of just comma separated values. We'll call
# `load_text_column`, which uses scikit-learn's
# `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
# to create a vocabulary and map the tokens.
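
###############################################################################
# To give a feel for what this tokenisation step does, here is a minimal,
# standalone `CountVectorizer` example on toy text. It only illustrates the
# general mechanism; the exact n-gram range and frequency thresholds used by
# `load_text_column` below are not reproduced here.

from sklearn.feature_extraction.text import CountVectorizer   # noqa

toy_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
toy_counts = toy_vectorizer.fit_transform(["interactive data visualization",
                                           "graph visualization techniques"])
print(toy_vectorizer.get_feature_names_out())
print(toy_counts.toarray())

""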

from cartodata.loading import load_text_column   # noqa
from sklearn.feature_extraction import text as sktxt   # noqa

with open('../datas/stopwords.txt', 'r') as stop_file:
    stopwords = sktxt.ENGLISH_STOP_WORDS.union(
        set(stop_file.read().splitlines()))

df['text'] = df['Abstract'] + ' ' \
    + df['AuthorKeywords'].astype(str) + ' ' \
    + df['Title']  # maybe add more

words_mat, words_scores = load_text_column(df['text'],
                                           4,
                                           10,
                                           0.05,
                                           stopwords=stopwords)

###############################################################################
# Here `words_scores` contains a list of all the n-grams extracted from the
# documents with their score,

words_scores.head()

###############################################################################
# and the `words_mat` matrix counts the occurrences of each of the 3387 n-grams
# for all the articles.

words_mat.shape

###############################################################################
# To get a better representation of the importance of each term, we'll also
# apply a TF-IDF (term-frequency times inverse document-frequency)
# normalization on the matrix.
#
# The `normalize_tfidf` function simply calls scikit-learn's
# `TfidfTransformer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer>`_
# class.

from cartodata.operations import normalize_tfidf   # noqa

words_mat = normalize_tfidf(words_mat)
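
###############################################################################
# As a standalone illustration of what this normalization does (toy counts,
# default `TfidfTransformer` settings, not necessarily the exact parameters
# used by `normalize_tfidf`):

from sklearn.feature_extraction.text import TfidfTransformer   # noqa
import numpy as np   # noqa

toy_tfidf = TfidfTransformer().fit_transform(np.array([[3, 0, 1],
                                                       [0, 2, 0]]))
print(toy_tfidf.toarray())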

###############################################################################
# Articles
# -------------
#
# Finally, we need to create a matrix that simply maps each article to itself.
# For the article scores, we will use the citation counts from the
# `XPloreCitationCount_02-2019` column (plus one).

df['XPloreCitationCount_02-2019']

""
from cartodata.loading import load_identity_column   # noqa
import scipy.sparse as scs   # noqa

# An alternative: articles_mat, articles_scores = load_identity_column(df, 'Title')
titles = df['Title']
articles_mat = scs.identity(len(titles))
# score each article with its citation count + 1
articles_scores = pd.Series(df['XPloreCitationCount_02-2019'].values + 1,
                            index=titles)

articles_scores.head()

###############################################################################
# Dimension reduction
# ===================
#
# One way to see the matrices that we created is as coordinates in the space of
# all articles. What we want to do is to reduce the dimension of this space to
# make it easier to work with and see.
#
# LSA projection
# ------------------------
#
# We'll start by using LSA (Latent Semantic Analysis) to identify latent
# semantic dimensions in our data and thus reduce the number of rows in our
# matrices. The `lsa_projection` method takes three arguments:
#
# - the number of dimensions you want to keep
# - the matrix of documents/words frequency
# - a list of matrices to project
#
# It returns a list of the same length containing the matrices projected in the
# latent space.
#
# We also apply an l2 normalization to each feature of the projected matrices.

from cartodata.projection import lsa_projection   # noqa
from cartodata.operations import normalize_l2   # noqa

lsa_matrices = lsa_projection(80,
                              words_mat,
                              [articles_mat, authors_mat, words_mat])
lsa_matrices = list(map(normalize_l2, lsa_matrices))

###############################################################################
# We've reduced the number of rows in each of `articles_mat`, `authors_mat`
# and `words_mat` to just 80.

print(f"articles_mat: {lsa_matrices[0].shape}")
print(f"authors_mat: {lsa_matrices[1].shape}")
print(f"words_mat: {lsa_matrices[2].shape}")

###############################################################################
# This makes it easier to work with them for clustering or nearest neighbors
# tasks, but we also want to project them on a 2D space to be able to map them.
#
# UMAP projection
# -------------------------
#
# `UMAP <https://github.com/lmcinnes/umap>`_ (Uniform Manifold Approximation
# and Projection) is a dimension reduction technique that can be used for
# visualisation similarly to t-SNE.
#
# We use this algorithm to project our matrices in 2 dimensions.

from cartodata.projection import umap_projection   # noqa

umap_matrices = umap_projection(lsa_matrices)
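
###############################################################################
# As a standalone illustration of the `umap-learn` API on toy data (this is
# not the cartodata pipeline itself, just the same kind of call):

import numpy as np   # noqa
import umap   # noqa

rng = np.random.default_rng(0)
toy_points = rng.normal(size=(200, 80))   # 200 points in an 80-dimensional space
toy_2d = umap.UMAP(n_components=2).fit_transform(toy_points)
print(toy_2d.shape)                       # (200, 2)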

###############################################################################
# Now that we have 2D coordinates for our points, we can try to plot them to
# get a feel of the data's shape.

import matplotlib.pyplot as plt  # noqa
import numpy as np  # noqa
import seaborn as sns  # noqa
# %matplotlib inline

sns.set(style='white', rc={'figure.figsize': (12, 8)})

labels = ('articles', 'authors', 'words')
colors = ['g', 'r', 'b']
markers = ['x', 's', '+']


def plot(matrices):
    plt.close('all')
    fig, ax = plt.subplots()

    axes = []

    for i, m in enumerate(matrices):
        axes.append(ax.scatter(m[0, :], m[1, :],
                               color=colors[i], marker=markers[i],
                               label=labels[i]))

    ax.legend(axes, labels, fancybox=True, shadow=True)

    return fig, ax


fig, ax = plot(umap_matrices)

###############################################################################
# On the plot above, articles are shown in green, authors in red and words in blue. 
# Because we don't have labels for the points, it doesn't make much sense as is. 
# But we can see that the data shows some clusters which we could try to identify.
#
# Clustering
# ==========
#
# In order to identify clusters, we use the KMeans clustering technique on the
# articles. We'll also try to label these clusters by selecting the most
# frequent words that appear in each cluster's articles.

from cartodata.clustering import create_kmeans_clusters  # noqa

cluster_labels = []
c_lsa, c_umap, c_scores, c_knn, _, _, _ = create_kmeans_clusters(
    8,                  # number of clusters to create
    umap_matrices[0],   # the 2D matrix of articles
    umap_matrices[2],   # the 2D matrix of words
    words_mat,          # the articles to words matrix
    words_scores,       # word scores
    cluster_labels,     # a list of initial cluster labels
    lsa_matrices[2]     # LSA space matrix of words
)

c_scores
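
###############################################################################
# To make the labelling step more concrete, here is a rough sketch of the idea
# behind it: cluster the articles' 2D positions with KMeans and label each
# cluster with the highest-weighted words of its articles. This is an
# illustration only, not `create_kmeans_clusters`' exact implementation.

from sklearn.cluster import KMeans   # noqa
import numpy as np   # noqa

km = KMeans(n_clusters=8, n_init=10, random_state=0)
cluster_ids = km.fit_predict(umap_matrices[0].T)   # one cluster id per article

for k in range(8):
    in_cluster = cluster_ids == k
    word_weights = np.asarray(words_mat[in_cluster].sum(axis=0)).ravel()
    top_words = words_scores.index[np.argsort(word_weights)[-3:]]
    print(k, ", ".join(top_words))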

""
fig, ax = plot(umap_matrices)

for i in range(8):
    ax.annotate(c_scores.index[i], (c_umap[0, i], c_umap[1, i]), 
                color='red')

###############################################################################
# The 8 clusters that we created give us a general idea of what the big
# clusters of data contain. But we'll probably want a finer level of detail if
# we start to zoom in and focus on smaller areas. So we'll also create a
# second, larger set of clusters. To do this, we simply increase the number of
# clusters we ask for.

mc_lsa, mc_umap, mc_scores, mc_knn, _, _, _ = create_kmeans_clusters(
    32,
    umap_matrices[0],
    umap_matrices[2],
    words_mat,
    words_scores,
    cluster_labels,
    lsa_matrices[2]
)

mc_scores

###############################################################################
# Nearest neighbors
# -----------------------------
#
# One more thing which could be useful to appreciate the quality of our data
# would be to get each point's nearest neighbors. If our data processing is
# done correctly, we expect related articles, words and authors to be located
# close to each other.
#
# Finding nearest neighbors is a common task with various algorithms aiming to
# solve it. The `get_neighbors` method uses one of these algorithms to find the
# nearest points of each type. It takes an optional weight parameter to tweak
# the distance calculation to select points that have a higher score but are
# maybe a bit farther, instead of just selecting the closest neighbors.
#
# Because we want to find the neighbors of each type (articles, authors,
# words) for all of the entities, we call the `get_neighbors` method in a loop
# and store its results in an array.

from cartodata.neighbors import get_neighbors   # noqa

scores = [articles_scores, authors_scores, words_scores]
weights = [0, 0.5, 0.5]
all_neighbors = []

for idx in range(len(lsa_matrices)):
    all_neighbors.append(get_neighbors(lsa_matrices[idx],
                                       scores[idx],
                                       lsa_matrices,
                                       weights[idx]))
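
###############################################################################
# To illustrate the underlying idea with plain scikit-learn (a sketch, not
# `get_neighbors`' exact algorithm, and ignoring the score weighting): find,
# for each of the first 3 authors, the closest articles in the LSA space.

from sklearn.neighbors import NearestNeighbors   # noqa

nn = NearestNeighbors(n_neighbors=5).fit(lsa_matrices[0].T)   # articles
distances, indices = nn.kneighbors(lsa_matrices[1][:, :3].T)  # first 3 authors
print(indices)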

###############################################################################
# Exporting
# -----------------
#
# We now have sufficient data to create a meaningful visualization.

###############################################################################
# We can now export the data to a JSON file with `export_to_json`.

from cartodata.operations import export_to_json  # noqa

natures = ['articles',
           'authors',
           'words',
           'hl_clusters',
           'ml_clusters'
           ]
export_file = '../datas/vispubdata_lsa.json'

# add the clusters to list of 2d matrices and scores
matrices = list(umap_matrices)
matrices.extend([c_umap, mc_umap])
scores.extend([c_scores, mc_scores])
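
###############################################################################
# A quick, illustrative consistency check: every nature should have a matching
# 2D matrix and score series.

assert len(natures) == len(matrices) == len(scores)

""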

# Create a json export file with all the infos
export_to_json(natures,
               matrices,
               scores,
               export_file,
               neighbors_natures=natures[:3],
               neighbors=all_neighbors)

###############################################################################
# This creates the `vispubdata_lsa.json` file which contains a list of points
# ready to be imported into Cartolabe. Have a look at it to check that it
# contains everything.

import json  # noqa

with open(export_file, 'r') as f:
    data = json.load(f)

data[1]['position']