"""
Extracting and processing VisPubData data with the Cartolabe API
==================================================================

Comparing the quality of embeddings using multiple methods
-------------------------------------------------------------
"""

# %matplotlib inline
# %load_ext autoreload
# %autoreload 2
# %matplotlib widget

###############################################################################
# Download data
# ================
#
# We will start by downloading the VisPubData dataset from a Google Spreadsheet.
# See Petra Isenberg, Florian Heimerl, Steffen Koch, Tobias Isenberg, Panpan Xu, et al.
# vispubdata.org: A Metadata Collection about IEEE Visualization (VIS) Publications.
# IEEE Transactions on Visualization and Computer Graphics, 2017, 23 (9), pp. 2199-2206.
# doi:`10.1109/TVCG.2016.2615308 <https://dx.doi.org/10.1109/TVCG.2016.2615308>`_, ⟨hal-01376597⟩.
#

SHEET_ID = '1xgoOPu28dQSSGPIp_HHQs0uvvcyLNdkMF9XtRajhhxU'
SHEET_NAME = 'Main%20dataset'
url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}'

min_df = 25
max_df = 0.1
max_words = 100000
vocab_sample = 250000
num_dims = 300
filt_min_score = 3
n_neighbors = 10

""
import pandas as pd   # noqa

df = pd.read_csv(url)
df.AuthorKeywords.fillna('', inplace=True)
df.Abstract.fillna('', inplace=True)
df.AuthorAffiliation.fillna('', inplace=True)
df['text'] = df.Abstract + ' ' \
            + df.AuthorKeywords + ' ' \
            + df.Title
df.head()

###############################################################################
# Creating correspondence matrices for each entity type
# ================================================================
#
# From this table of articles, we want to extract matrices that will map the
# correspondence between these articles and the entities we want to use.

###############################################################################
# Authors
# --------------
#
# Let's start with the authors for example. We want to create a matrix where
# the rows represent the articles and the columns represent the authors. Each
# cell (n, m) will have a 1 in it if the *nth* article was written by the *mth*
# author.
#
# The function returns the correspondence matrix together with a table of
# scores counting, for each author, the number of articles they are linked to.

from cartodata.loading import load_comma_separated_column  # noqa

authors_mat, authors_scores = load_comma_separated_column(df, 'AuthorNames-Deduped', comma=';')
authors_mat.shape

""
authors_scores.head()

###############################################################################
# If we look at the *2nd* column of the matrix, which corresponds to the author
# **Michelle Borkin**, we can see that it has 8 non-zero rows, each one
# indicating an article she authored.

print(authors_mat[:, 1])
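
""
# As an aside, a hedged illustration (assuming the dataframe row order matches
# the matrix rows): the non-zero row indices of that column give the positions
# of her articles, so we can look up their titles.
borkin_rows = authors_mat[:, 1].nonzero()[0]
df.Title.iloc[borkin_rows]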

###############################################################################
# Filtering low score entities
# ---------------------------------------
#
# A lot of the authors that we just extracted from the dataframe have
# a very low score, which means they are only linked to one or two articles. To
# improve the quality of our data, we will filter out the authors that appear
# fewer than `filt_min_score` (3) times.
#
# To do this, we'll use the `filter_min_score` function.

from cartodata.operations import filter_min_score  # noqa

authors_before = len(authors_scores)

authors_mat, authors_scores = filter_min_score(authors_mat,
                                               authors_scores,
                                               filt_min_score)

print(f"Removed {authors_before - len(authors_scores)} authors with less "
      f"than 3 articles from a total of {authors_before} authors.")
print(f"Working with {len(authors_scores)} authors.\n")


###############################################################################
# Words
# ------------
#
# For the words, it's a bit trickier because we want to extract n-grams (groups
# of n terms) instead of just comma separated values. We'll call the
# `load_text_column` function, which uses scikit-learn's
# `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
# to create a vocabulary and map the tokens.

from cartodata.loading import load_text_column  # noqa
from sklearn.feature_extraction import text as sktxt  # noqa

with open('../datas/stopwords.txt', 'r') as stop_file:
    stopwords = sktxt.ENGLISH_STOP_WORDS.union(
        set(stop_file.read().splitlines()))

words_mat, words_scores = load_text_column(df['text'],
                                           4,
                                           min_df,
                                           max_df,
                                           stopwords=stopwords)

""
words_scores.head()

""
words_mat.shape
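
""
# A quick look at the extracted vocabulary (assuming `words_scores` is a
# pandas Series of term frequencies): the highest-scoring terms in the corpus.
words_scores.sort_values(ascending=False).head(10)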

""
from cartodata.operations import normalize_tfidf  # noqa

words_mat = normalize_tfidf(words_mat)
words_mat.shape

""
from cartodata.loading import load_identity_column  # noqa

articles_mat, articles_scores = load_identity_column(df, 'Title')
articles_scores.head()

###############################################################################
# Dimension reduction/Embeddings
# ==============================
#
# One way to see the matrices that we created is as coordinates in the space of
# all articles. What we want to do is to reduce the dimension of this space to
# make it easier to work with and see.
#
# Validation
# ----------
#
# We compute a score that measures how often, on average, the 10 nearest
# neighbors of an article are articles by the same author.
# For each author, this gives a value between 1 (100%: every neighbor shares
# the author) and 0.1 (no neighbor other than the article itself is by the
# same author).
#
# LSA projection
# -------------------------
#
# We'll start by using the LSA (Latent Semantic Analysis) technique to identify
# keywords in our data and thus reduce the number of rows in our matrices. The
# `lsa_projection` method takes three arguments:
#
# - the number of dimensions you want to keep
# - the matrix of documents/words frequency
# - a list of matrices to project
#
# It returns a list of the same length containing the matrices projected in the
# latent space.
#
# We also apply an l2 normalization to each feature of the projected matrices.

from cartodata.projection import lsa_projection  # noqa
from cartodata.operations import normalize_l2  # noqa

""
# %%time
lsa_matrices = lsa_projection(num_dims, words_mat, [articles_mat, authors_mat, words_mat])

""
lsa_matrices = list(map(normalize_l2, lsa_matrices))

###############################################################################
# We've reduced the number of rows in each of `articles_mat`, `authors_mat`
# and `words_mat` to just `num_dims` (300).

print(f"articles_mat: {lsa_matrices[0].shape}")
print(f"authors_mat: {lsa_matrices[1].shape}")
print(f"words_mat: {lsa_matrices[2].shape}")

""
from cartodata.model_selection.scoring import Neighbors  # noqa

NATURE = "articles"
SOURCE = "authors"

lsa_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=lsa_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
lsa_score.print()
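
###############################################################################
# To make the neighbors score more concrete, here is a minimal sketch of the
# same idea computed directly with scikit-learn. This is only an illustration
# under stated assumptions, not the implementation behind `Neighbors.evaluate`
# (whose exact counting and normalisation may differ): it assumes, as printed
# above, that the projected matrix has one column per article and that
# `authors_mat` has one row per article.

import numpy as np  # noqa
from scipy import sparse  # noqa
from sklearn.neighbors import NearestNeighbors  # noqa

# One row per article in the embedded space.
article_emb = lsa_matrices[0].T

# Articles i and j share an author when rows i and j of authors_mat have a
# non-zero dot product.
coauthor = sparse.csr_matrix(authors_mat) @ sparse.csr_matrix(authors_mat).T

# For each article, query its nearest neighbors (the article itself comes back
# as one of them) and count the fraction that share at least one author.
nn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(article_emb)
_, neigh = nn.kneighbors(article_emb)

fractions = []
for i, row in enumerate(neigh):
    others = row[row != i][:n_neighbors]
    fractions.append(
        np.count_nonzero(coauthor[i, others].toarray()) / len(others))

print(f"Fraction of LSA neighbors sharing an author: {np.mean(fractions):.3f}")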

###############################################################################
# LDA projection
# -------------------------
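#
# We repeat the projection with LDA (Latent Dirichlet Allocation) in place of
# LSA, keeping the same number of dimensions, and evaluate it with the same
# neighbors score.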

from cartodata.projection import lda_projection  # noqa

""
# %%time
lda_matrices = lda_projection(num_dims, 1, [articles_mat, authors_mat, words_mat])

""
lda_matrices = list(map(normalize_l2, lda_matrices))

""
print(f"articles_mat: {lda_matrices[0].shape}")
print(f"authors_mat: {lda_matrices[1].shape}")
print(f"words_mat: {lda_matrices[2].shape}")

""
lda_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=lda_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
lda_score.print()

###############################################################################
# DOC2Vec projection
# -------------------------
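#
# This time the embedding is learned directly from the article texts
# (`df['text']`) with Doc2Vec, rather than from the bag-of-words matrix, and
# the three matrices are projected into that space.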

from cartodata.projection import doc2vec_projection  # noqa

""
# %%time
doc2vec_matrices = doc2vec_projection(num_dims, 1, [articles_mat, authors_mat, words_mat], df['text'])

""
doc2vec_matrices = list(map(normalize_l2, doc2vec_matrices))

""
print(f"articles_mat: {doc2vec_matrices[0].shape}")
print(f"authors_mat: {doc2vec_matrices[1].shape}")
print(f"words_mat: {doc2vec_matrices[2].shape}")

""
doc2vec_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=doc2vec_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
doc2vec_score.print()

###############################################################################
# Specter2 projection
# -------------------------
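#
# `bert_projection` embeds the article texts with a pretrained transformer
# model. Here no `family` argument is passed, so the default model, SPECTER2
# (a transformer trained for scientific-document embeddings), is used; the
# later calls select other model families explicitly.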

from cartodata.projection import bert_projection  # noqa

""
# %%time
specter2_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'])

""
specter2_matrices = list(map(normalize_l2, specter2_matrices))

""
print(f"articles_mat: {specter2_matrices[0].shape}")
print(f"authors_mat: {specter2_matrices[1].shape}")
print(f"words_mat: {specter2_matrices[2].shape}")

""
specter2_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=specter2_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
specter2_score.print()

###############################################################################
# SciNCL projection
# -------------------------

# %%time
scincl_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="scincl")

""
scincl_matrices = list(map(normalize_l2, scincl_matrices))

""
print(f"articles_mat: {scincl_matrices[0].shape}")
print(f"authors_mat: {scincl_matrices[1].shape}")
print(f"words_mat: {scincl_matrices[2].shape}")

""
scincl_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=scincl_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
scincl_score.print()

###############################################################################
# "all-MiniLM-L6-v2" projection
# ------------------------------

# %%time
minilm_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-MiniLM-L6-v2")

""
minilm_matrices = list(map(normalize_l2, minilm_matrices))

""
print(f"articles_mat: {minilm_matrices[0].shape}")
print(f"authors_mat: {minilm_matrices[1].shape}")
print(f"words_mat: {minilm_matrices[2].shape}")

""

minilm_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=minilm_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
minilm_score.print()

###############################################################################
# "all-mpnet-base-v2" projection
# ------------------------------

# %%time
mpnet_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-mpnet-base-v2")

""
mpnet_matrices = list(map(normalize_l2, mpnet_matrices))

""
print(f"articles_mat: {mpnet_matrices[0].shape}")
print(f"authors_mat: {mpnet_matrices[1].shape}")
print(f"words_mat: {mpnet_matrices[2].shape}")

""
mpnet_score = Neighbors.evaluate(
    NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", 
    scores_nature=articles_scores, matrix_nature_xD=mpnet_matrices[0], 
    min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
)
mpnet_score.print()