Python pour la data science

Lino Galiana

doi:10.5281/zenodo.8229676

path = window.location.pathname.replace(".html", ".qmd");
path_modified = (path.includes('en/content')) ? path.replace('en/content', 'content/en') : path
html`${printBadges({fpath: path_modified})}`

html`<div>${getConditionalHTML(path, true)}</div>`

function isBetweenSeptAndDec() {
    const now = new Date();
    const start = new Date(now.getFullYear(), 8, 1); // September 1st (month is zero-indexed)
    const end = new Date(now.getFullYear(), 11, 24); // December 24th (month is zero-indexed)
    
    return now >= start && now <= end;
}

function getConditionalHTML(path, print) {
    if (print === false) return ``
    if (isBetweenSeptAndDec()) {
        return md`<i>La correction sera visible prochainement sur cette page. En attendant, la liste des corrections déjà acccessibles est [ici](/content/annexes/corrections.html)</i>`; // Return an empty string if not between the dates
    } else {
        return html`
        <details>
            <summary>
                Pour ouvrir la version corrigée sous forme de <i>notebook</i>
            </summary>
            ${printBadges({ fpath: path, correction: true })}
        </details>
        `;
    }
}

function renderCorrection({ fpath, correction }) {
    if (correction) {
        return html`${printBadges({ fpath: fpath, correction: true })}`;
    } else {
        return html`<i>La correction sera visible prochainement sur cette page.</i>`;
    }
}

function reminderBadges({
    sourceFile = "content/01_toto.Rmd",
    type = ['md', 'html'],
    split = null,
    onyxiaOnly = false,
    sspCloudService = "python",
    GPU = false,
    correction = false
} = {}) {
    if (Array.isArray(type)) {
        type = type[0];
    }

    let notebook = sourceFile.replace(/(.Rmd|.qmd)/, ".ipynb");
    if (correction) {
        notebook = notebook.replace(/content/, "corrections");
    } else {
        notebook = notebook.replace(/content/, "notebooks");
    }

    const githubRepoNotebooksSimplified = "github/linogaliana/python-datascientist-notebooks";
    const githubAlias = githubRepoNotebooksSimplified.replace("github", "github.com");
    const githubRepoNotebooks = `https://${githubAlias}`;

    let githubLink ;

    if (notebook === "") {
        githubLink = githubRepoNotebooks;
    } else {
        githubLink = `${githubRepoNotebooks}/blob/main`;
    }

    const notebookRelPath = `/${notebook}`;
    const [section, chapter] = notebook.split("/").slice(-2);

    githubLink = `<a href="${githubLink}${notebookRelPath}" class="github"><i class="fab fa-github"></i></a>`;

    let sectionLatest = section.split("/").slice(-1)[0];
    const chapterNoExtension = chapter.replace(".ipynb", "");
    
    if (sourceFile.includes('en/')) {
        sectionLatest = `en/${sectionLatest}`;
    }

    const onyxiaInitArgs = [sectionLatest, chapterNoExtension];

    if (correction) {
        onyxiaInitArgs.push("correction");
    }

    const gpuSuffix = GPU ? "-gpu" : "";

    const sspcloudJupyterLinkLauncher = `https://datalab.sspcloud.fr/launcher/ide/jupyter-${sspCloudService}${gpuSuffix}?autoLaunch=true&onyxia.friendlyName=%C2%AB${chapterNoExtension}%C2%BB&init.personalInit=%C2%ABhttps%3A%2F%2Fraw.githubusercontent.com%2Flinogaliana%2Fpython-datascientist%2Fmaster%2Fsspcloud%2Finit-jupyter.sh%C2%BB&init.personalInitArgs=%C2%AB${onyxiaInitArgs.join('%20')}%C2%BB`;

    let sspcloudJupyterLink;
    if (type === "md") {
        sspcloudJupyterLink = `[![Onyxia](https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_Jupyter-orange?logo=Jupyter&logoColor=orange)](${sspcloudJupyterLinkLauncher})`;
    } else {
        sspcloudJupyterLink = `<a href="${sspcloudJupyterLinkLauncher}" target="_blank" rel="noopener"><img src="https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_Jupyter-orange?logo=Jupyter&logoColor=orange" alt="Onyxia"></a>`;
    }

    if (split === 4) {
        sspcloudJupyterLink += '<br>';
    }

    const sspcloudVscodeLinkLauncher = `https://datalab.sspcloud.fr/launcher/ide/vscode-${sspCloudService}${gpuSuffix}?autoLaunch=true&onyxia.friendlyName=%C2%AB${chapterNoExtension}%C2%BB&init.personalInit=%C2%ABhttps%3A%2F%2Fraw.githubusercontent.com%2Flinogaliana%2Fpython-datascientist%2Fmaster%2Fsspcloud%2Finit-vscode.sh%C2%BB&init.personalInitArgs=%C2%AB${onyxiaInitArgs.join('%20')}%C2%BB`;

    let sspcloudVscodeLink;
    if (type === "md") {
        sspcloudVscodeLink = `[![Onyxia](https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_VSCode-blue?logo=visualstudiocode&logoColor=blue)](${sspcloudVscodeLinkLauncher})`;
    } else {
        sspcloudVscodeLink = `<a href="${sspcloudVscodeLinkLauncher}" target="_blank" rel="noopener"><img src="https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_VSCode-blue?logo=visualstudiocode&logoColor=blue" alt="Onyxia"></a>`;
    }

    if (split === 5) {
        sspcloudVscodeLink += '<br>';
    }

    let colabLink;
    if (type === "md") {
        colabLink = `[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/${githubRepoNotebooksSimplified}/blob/main${notebookRelPath})`;
    } else {
        colabLink = `<a href="https://colab.research.google.com/${githubRepoNotebooksSimplified}/blob/main${notebookRelPath}" target="_blank" rel="noopener"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>`;
    }

    if (split === 7) {
        colabLink += '<br>';
    }

    let vscodeLink;
    if (type === "md") {
        vscodeLink = `[![githubdev](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Open%20in%20Visual Studio Code&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.dev/linogaliana/python-datascientist-notebooks${notebookRelPath})`;
    } else {
        vscodeLink = `<a href="https://github.dev/linogaliana/python-datascientist-notebooks${notebookRelPath}" target="_blank" rel="noopener"><img src="https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Open%20in%20Visual%20Studio%20Code&labelColor=2c2c32&color=007acc&logoColor=007acc" alt="githubdev"></a></p>`;
    }

    const badges = [
        githubLink,
        sspcloudVscodeLink,
        sspcloudJupyterLink
    ];

    if (!onyxiaOnly) {
        badges.push(colabLink);
    }

    let result = badges.join("\n");

    if (type === "html") {
        result = `<p class="badges">${result}</p>`;
    }

    if (onyxiaOnly) {
        result = `${sspcloudJupyterLink}${sspcloudVscodeLink}`;
    }

    return result;
}


function printBadges({
    fpath,
    onyxiaOnly = false,
    split = 5,
    type = "html",
    sspCloudService = "python",
    GPU = false,
    correction = false
} = {}) {
    const badges = reminderBadges({
        sourceFile: fpath,
        type: type,
        split: split,
        onyxiaOnly: onyxiaOnly,
        sspCloudService: sspCloudService,
        GPU: GPU,
        correction: correction
    });

    return badges
}

Cette page approfondit les exercices présentés dans la section précédente. On va ainsi continuer notre exploration de la littérature anglophones :

Edgar Allan Poe, (EAP) ;
HP Lovecraft (HPL) ;
Mary Wollstonecraft Shelley (MWS).

Les données sont disponibles dans la base de données spooky.csv et peuvent être importées par Python en utilisant directement l’url https://github.com/GU4243-ADS/spring2018-project1-ginnyqg/raw/master/data/spooky.csv.

Le but va être dans un premier temps de regarder dans le détail les termes les plus fréquents utilisés par les auteurs, et les représenter graphiquement.

Ce notebook est librement inspiré de :

https://www.kaggle.com/enerrio/scary-nlp-with-spacy-and-keras
https://github.com/GU4243-ADS/spring2018-project1-ginnyqg
https://www.kaggle.com/meiyizi/spooky-nlp-and-topic-modelling-tutorial/notebook

La LDA est une technique d’estimation bayésienne. Le cours d’Alberto Brietti sur le sujet constitue une très bonne ressource pour comprendre les fondements de cette technique.

1 Librairies nécessaires

Cette page évoquera les principales librairies pour faire du NLP, notamment :

Hint

Comme dans la partie précédente, il faut télécharger quelques éléments pour que NTLK puisse fonctionner correctement. Pour cela, faire:

import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("genesis")
nltk.download("omw-1.4")

Installations préalables :

!pip install wordcloud
!pip install pyLDAvis

Requirement already satisfied: wordcloud in /opt/conda/lib/python3.12/site-packages (1.9.3)
Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.12/site-packages (from wordcloud) (1.26.4)
Requirement already satisfied: pillow in /opt/conda/lib/python3.12/site-packages (from wordcloud) (10.4.0)
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.12/site-packages (from wordcloud) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (24.1)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.12/site-packages (from matplotlib->wordcloud) (2.9.0)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Requirement already satisfied: numpy>=1.24.2 in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (1.26.4)
Requirement already satisfied: scipy in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (1.13.0)
Requirement already satisfied: pandas>=2.0.0 in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (2.2.3)
Requirement already satisfied: joblib>=1.2.0 in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (1.4.2)
Requirement already satisfied: jinja2 in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (3.1.4)
Collecting numexpr (from pyLDAvis)
  Downloading numexpr-2.10.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Requirement already satisfied: scikit-learn>=1.0.0 in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (1.5.2)
Requirement already satisfied: gensim in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (4.3.2)
Requirement already satisfied: setuptools in /opt/conda/lib/python3.12/site-packages (from pyLDAvis) (74.1.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.12/site-packages (from pandas>=2.0.0->pyLDAvis) (2.9.0)
Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.12/site-packages (from pandas>=2.0.0->pyLDAvis) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.12/site-packages (from pandas>=2.0.0->pyLDAvis) (2024.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/conda/lib/python3.12/site-packages (from scikit-learn>=1.0.0->pyLDAvis) (3.5.0)
Requirement already satisfied: smart-open>=1.8.1 in /opt/conda/lib/python3.12/site-packages (from gensim->pyLDAvis) (7.0.5)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.12/site-packages (from jinja2->pyLDAvis) (2.1.5)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=2.0.0->pyLDAvis) (1.16.0)
Requirement already satisfied: wrapt in /opt/conda/lib/python3.12/site-packages (from smart-open>=1.8.1->gensim->pyLDAvis) (1.16.0)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/2.6 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 55.5 MB/s eta 0:00:00
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Downloading numexpr-2.10.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (408 kB)
Installing collected packages: funcy, numexpr, pyLDAvis
Successfully installed funcy-2.0 numexpr-2.10.1 pyLDAvis-3.4.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

La liste des modules à importer est assez longue, la voici :

import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("genesis")
nltk.download("wordnet")
nltk.download("omw-1.4")

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# from IPython.display import display
import base64
import string
import re
import nltk

from collections import Counter
from time import time

# from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

[nltk_data] Downloading package stopwords to /github/home/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /github/home/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /github/home/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package genesis to /github/home/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package wordnet to /github/home/nltk_data...
[nltk_data] Downloading package omw-1.4 to /github/home/nltk_data...

2 Données utilisées

Si vous avez déjà lu la section précédente et importé les données, vous pouvez passer à la section suivante

Le code suivant permet d’importer le jeu de données spooky:

import pandas as pd

url = "https://github.com/GU4243-ADS/spring2018-project1-ginnyqg/raw/master/data/spooky.csv"
import pandas as pd

train = pd.read_csv(url, encoding="latin-1")
train.columns = train.columns.str.capitalize()

train["ID"] = train["Id"].str.replace("id", "")
train = train.set_index("Id")

Le jeu de données met ainsi en regard un auteur avec une phrase qu’il a écrite:

train.head()

	Text	Author	ID
Id
id26305	This process, however, afforded me no means of...	EAP	26305
id17569	It never once occurred to me that the fumbling...	HPL	17569
id11008	In his left hand was a gold snuff box, from wh...	EAP	11008
id27763	How lovely is spring As we looked from Windsor...	MWS	27763
id12958	Finding nothing else, not even gold, the Super...	HPL	12958

Les étapes de preprocessing sont expliquées dans le chapitre précédent. On applique les étapes suivantes :

Tokeniser
Retirer la ponctuation et les stopwords
Lemmatiser le texte

lemma = WordNetLemmatizer()

train_clean = (
    train.groupby(["ID", "Author"])
    .apply(lambda s: nltk.word_tokenize(" ".join(s["Text"])))
    .apply(lambda words: [word for word in words if word.isalpha()])
)

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

train_clean = train_clean.apply(
    lambda words: [lemma.lemmatize(w) for w in words if not w in stop_words]
).reset_index(name="tokenized")

train_clean.head(2)

/tmp/ipykernel_1306/2725603950.py:5: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

	ID	Author	tokenized
0	00001	MWS	[Idris, well, content, resolve, mine]
1	00002	HPL	[I, faint, even, fainter, hateful, modernity, ...

3 Principe de la LDA (Latent Dirichlet Allocation)

Le modèle Latent Dirichlet Allocation (LDA) est un modèle probabiliste génératif qui permet de décrire des collections de documents de texte ou d’autres types de données discrètes. LDA fait partie d’une catégorie de modèles appelés “topic models”, qui cherchent à découvrir des structures thématiques cachées dans des vastes archives de documents.

Ceci permet d’obtenir des méthodes efficaces pour le traitement et l’organisation des documents de ces archives : organisation automatique des documents par sujet, recherche, compréhension et analyse du texte, ou même résumer des textes.

Aujourd’hui, ce genre de méthodes s’utilisent fréquemment dans le web, par exemple pour analyser des ensemble d’articles d’actualité, les regrouper par sujet, faire de la recommandation d’articles, etc.

La LDA est une méthode qui considère les corpus comme des mélanges de sujets et de mots. Chaque document peut être représenté comme le résultat d’un mélange :

de sujets
et, au sein de ces sujets, d’un choix de mots.

L’estimation des paramètres de la LDA passe par l’estimation des distributions des variables latentes à partir des données observées (posterior inference). Mathématiquement, on peut se représenter la LDA comme une technique de maximisation de log vraisemblance avec un algorithme EM (expectation maximisation) dans un modèle de mélange.

La matrice termes-documents qui sert de point de départ est la suivante :

	word_1	word_2	word_3	…	word_J
doc_1	3	0	1	…	0
…	…	…	…	…	…
doc_N	1	0	0	…	5

On dit que cette matrice est sparse (creuse en Français) car elle contient principalement des 0. En effet, un document n’utilise qu’une partie mineure du vocabulaire complet.

La LDA consiste à transformer cette matrice sparse document-terme en deux matrices de moindre dimension :

Une matrice document-sujet
Une matrice sujet-mots

En notant \(K_i\) le sujet \(i\). On obtient donc

Une matrice document-sujet ayant la structure suivante :

	K_1	K_2	K_3	…	K_M
doc_1	1	0	1	…	0
…	…	…	…	…	…
doc_N	1	1	1	…	0

Une matrice sujets-mots ayant la structure suivante :

	word_1	word_2	word_3	…	word_J
K_1	1	0	0	…	0
…	…	…	…	…	…
K_M	1	1	1	…	0

Ces deux matrices ont l’interprétation suivante :

La première nous renseigne sur la présence d’un sujet dans un document
La seconde nous renseigne sur la présence d’un mot dans un sujet

En fait, le principe de la LDA est de construire ces deux matrices à partir des fréquences d’apparition des mots dans le texte.

On va se concentrer sur Edgar Allan Poe.

corpus = train_clean[train_clean["Author"] == "EAP"]

4 Entraîner une LDA

Il existe plusieurs manières d’entraîner une LDA.

Nous allons utiliser Scikit ici avec la méthode LatentDirichletAllocation. Comme expliqué dans la partie modélisation :

On initialise le modèle ;
On le met à jour avec la méthode fit.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(
    stop_words="english"
)  # Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(
    corpus["tokenized"].apply(lambda s: " ".join(s))
)

# Tweak the two parameters below
number_topics = 5
number_words = 10  # Create and fit the LDA model
lda = LatentDirichletAllocation(
    n_components=11,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
    n_jobs=1,
)
lda.fit(count_data)

5 Visualiser les résultats

On peut déjà commencer par utiliser une fonction pour afficher les résultats :

# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]))


print_topics(lda, count_vectorizer, number_words)


Topic #0:
life man say direction business lady good mean turn held

Topic #1:
hand room door heard house open right window machine went

Topic #2:
thought shall time day mind night looked corpse hour say

Topic #3:
heart sure voice oh ha ah term apparent sake vessel

Topic #4:
thing minute sound look low distinct bug color dat dream

Topic #5:
quite account come really madame feeling sir monsieur wonder use

Topic #6:
little great eye say like man foot let way long

Topic #7:
said object friend nature word dupin letter movement replied possible

Topic #8:
body earth purpose atmosphere respect cause suppose chamber surface moon

Topic #9:
know human scarcely particular case drawer public principle appearance knowledge

Topic #10:
left manner thousand end saw state head people arm said

La représentation sous forme de liste de mots n’est pas la plus pratique…

On peut essayer de se représenter un wordcloud de chaque sujet pour mieux voir si cette piste est pertinente :

tf_feature_names = count_vectorizer.get_feature_names_out()


def wordcloud_lda(lda, tf_feature_names):

    fig, axs = plt.subplots(len(lda.components_) // 3 + 1, 3)

    for i in range(len(lda.components_)):
        corpus_lda = lda.components_[i]
        first_topic_words = [
            tf_feature_names[l] for l in corpus_lda.argsort()[: -50 - 1 : -1]
        ]
        k = i // 3
        j = i - k * 3
        wordcloud = WordCloud(
            stopwords=stop_words, background_color="black", width=2500, height=1800
        )
        wordcloud = wordcloud.generate(" ".join(first_topic_words))
        axs[k][j].set_title("Wordcloud pour le \nsujet {}".format(i))
        axs[k][j].axis("off")
        axs[k][j].imshow(wordcloud)

    r = len(lda.components_) % 3
    [
        fig.delaxes(axs[len(lda.components_) // 3, k - 1])
        for k in range(r + 1, 3 + 1)
        if r != 0
    ]


wc = wordcloud_lda(lda, tf_feature_names)
wc

wc

Le module pyLDAvis offre quelques visualisations bien pratiques lorsqu’on désire représenter de manière synthétique les résultats d’une LDA et observer la distribution sujet x mots.

Hint

Dans un notebook faire :

import pyLDAvis.lda_model

pyLDAvis.enable_notebook()

Pour les utilisateurs de Windows, il est nécessaire d’ajouter l’argument n_jobs = 1. Sinon, Python tente d’entraîner le modèle avec de la parallélisation. Le problème est que les processus sont des FORKs, ce que Windows ne supporte pas. Sur un système Unix (Linux, Mac OS), on peut se passer de cet argument.

#!pip install pyLDAvis #à faire en haut du notebook sur colab
import pyLDAvis
import pyLDAvis.lda_model

# pyLDAvis.enable_notebook()
vis_data = pyLDAvis.lda_model.prepare(lda, count_data, count_vectorizer, n_jobs=1)
pyLDAvis.display(vis_data)

Chaque bulle représente un sujet. Plus la bulle est grande, plus il y a de documents qui traitent de ce sujet.

Plus les barres sont loin les unes des autres, plus elles sont différentes. Un bon modèle aura donc tendance à avoir de grandes bulles qui ne se recoupent pas. Ce n’est pas vraiment le cas ici…

Les barres bleues représentent la fréquence de chaque mot dans le corpus.
Les barres rouges représentent une estimation du nombre de termes générés dans un sujet précis. La barre rouge la plus longue correspond au mot le plus utilisé dans ce sujet.

6 Références

Le poly d’Alberto Brietti

Informations additionnelles

environment files have been tested on.

Latest built version: 2024-11-20

Python version used:

'3.12.6 | packaged by conda-forge | (main, Sep 30 2024, 18:08:52) [GCC 13.3.0]'

Package	Version
affine	2.4.0
aiobotocore	2.15.1
aiohappyeyeballs	2.4.3
aiohttp	3.10.8
aioitertools	0.12.0
aiosignal	1.3.1
alembic	1.13.3
altair	5.4.1
aniso8601	9.0.1
annotated-types	0.7.0
appdirs	1.4.4
archspec	0.2.3
asttokens	2.4.1
attrs	24.2.0
babel	2.16.0
bcrypt	4.2.0
beautifulsoup4	4.12.3
black	24.8.0
blinker	1.8.2
blis	0.7.11
bokeh	3.5.2
boltons	24.0.0
boto3	1.35.23
botocore	1.35.23
branca	0.7.2
Brotli	1.1.0
cachetools	5.5.0
cartiflette	0.0.2
Cartopy	0.24.1
catalogue	2.0.10
cattrs	24.1.2
certifi	2024.8.30
cffi	1.17.1
charset-normalizer	3.3.2
click	8.1.7
click-plugins	1.1.1
cligj	0.7.2
cloudpathlib	0.20.0
cloudpickle	3.0.0
colorama	0.4.6
comm	0.2.2
commonmark	0.9.1
conda	24.9.1
conda-libmamba-solver	24.7.0
conda-package-handling	2.3.0
conda_package_streaming	0.10.0
confection	0.1.5
contextily	1.6.2
contourpy	1.3.0
cryptography	43.0.1
cycler	0.12.1
cymem	2.0.8
cytoolz	1.0.0
dask	2024.9.1
dask-expr	1.1.15
databricks-sdk	0.33.0
debugpy	1.8.6
decorator	5.1.1
Deprecated	1.2.14
diskcache	5.6.3
distributed	2024.9.1
distro	1.9.0
docker	7.1.0
duckdb	0.10.1
en-core-web-sm	3.7.1
entrypoints	0.4
et_xmlfile	2.0.0
exceptiongroup	1.2.2
executing	2.1.0
fastexcel	0.11.6
fastjsonschema	2.20.0
fiona	1.10.1
Flask	3.0.3
folium	0.17.0
fontawesomefree	6.6.0
fonttools	4.54.1
frozendict	2.4.4
frozenlist	1.4.1
fsspec	2023.12.2
funcy	2.0
gensim	4.3.2
geographiclib	2.0
geopandas	1.0.1
geoplot	0.5.1
geopy	2.4.1
gitdb	4.0.11
GitPython	3.1.43
google-auth	2.35.0
graphene	3.3
graphql-core	3.2.4
graphql-relay	3.2.0
graphviz	0.20.3
great-tables	0.12.0
greenlet	3.1.1
gunicorn	22.0.0
h2	4.1.0
hpack	4.0.0
htmltools	0.6.0
hyperframe	6.0.1
idna	3.10
imageio	2.36.0
importlib_metadata	8.5.0
importlib_resources	6.4.5
inflate64	1.0.0
ipykernel	6.29.5
ipython	8.28.0
itsdangerous	2.2.0
jedi	0.19.1
Jinja2	3.1.4
jmespath	1.0.1
joblib	1.4.2
jsonpatch	1.33
jsonpointer	3.0.0
jsonschema	4.23.0
jsonschema-specifications	2024.10.1
jupyter-cache	1.0.0
jupyter_client	8.6.3
jupyter_core	5.7.2
kaleido	0.2.1
kiwisolver	1.4.7
langcodes	3.5.0
language_data	1.3.0
lazy_loader	0.4
libmambapy	1.5.9
locket	1.0.0
lxml	5.3.0
lz4	4.3.3
Mako	1.3.5
mamba	1.5.9
mapclassify	2.8.1
marisa-trie	1.2.1
Markdown	3.6
markdown-it-py	3.0.0
MarkupSafe	2.1.5
matplotlib	3.9.2
matplotlib-inline	0.1.7
mdurl	0.1.2
menuinst	2.1.2
mercantile	1.2.1
mizani	0.11.4
mlflow	2.16.2
mlflow-skinny	2.16.2
msgpack	1.1.0
multidict	6.1.0
multivolumefile	0.2.3
munkres	1.1.4
murmurhash	1.0.10
mypy-extensions	1.0.0
narwhals	1.14.1
nbclient	0.10.0
nbformat	5.10.4
nest_asyncio	1.6.0
networkx	3.3
nltk	3.9.1
numexpr	2.10.1
numpy	1.26.4
opencv-python-headless	4.10.0.84
openpyxl	3.1.5
opentelemetry-api	1.16.0
opentelemetry-sdk	1.16.0
opentelemetry-semantic-conventions	0.37b0
OWSLib	0.28.1
packaging	24.1
pandas	2.2.3
paramiko	3.5.0
parso	0.8.4
partd	1.4.2
pathspec	0.12.1
patsy	0.5.6
Pebble	5.0.7
pexpect	4.9.0
pickleshare	0.7.5
pillow	10.4.0
pip	24.2
platformdirs	4.3.6
plotly	5.24.1
plotnine	0.13.6
pluggy	1.5.0
polars	1.8.2
preshed	3.0.9
prometheus_client	0.21.0
prometheus_flask_exporter	0.23.1
prompt_toolkit	3.0.48
protobuf	4.25.3
psutil	6.0.0
ptyprocess	0.7.0
pure_eval	0.2.3
py7zr	0.20.8
pyarrow	17.0.0
pyarrow-hotfix	0.6
pyasn1	0.6.1
pyasn1_modules	0.4.1
pybcj	1.0.2
pycosat	0.6.6
pycparser	2.22
pycryptodomex	3.21.0
pydantic	2.9.2
pydantic_core	2.23.4
Pygments	2.18.0
pyLDAvis	3.4.1
PyNaCl	1.5.0
pynsee	0.1.8
pyogrio	0.10.0
pyOpenSSL	24.2.1
pyparsing	3.1.4
pyppmd	1.1.0
pyproj	3.7.0
pyshp	2.3.1
PySocks	1.7.1
python-dateutil	2.9.0
python-dotenv	1.0.1
python-magic	0.4.27
pytz	2024.1
pyu2f	0.1.5
pywaffle	1.1.1
PyYAML	6.0.2
pyzmq	26.2.0
pyzstd	0.16.2
querystring_parser	1.2.4
rasterio	1.4.2
referencing	0.35.1
regex	2024.9.11
requests	2.32.3
requests-cache	1.2.1
retrying	1.3.4
rich	13.9.4
rpds-py	0.21.0
rsa	4.9
ruamel.yaml	0.18.6
ruamel.yaml.clib	0.2.8
s3fs	2023.12.2
s3transfer	0.10.2
scikit-image	0.24.0
scikit-learn	1.5.2
scipy	1.13.0
seaborn	0.13.2
setuptools	74.1.2
shapely	2.0.6
shellingham	1.5.4
six	1.16.0
smart-open	7.0.5
smmap	5.0.0
sortedcontainers	2.4.0
soupsieve	2.5
spacy	3.7.5
spacy-legacy	3.0.12
spacy-loggers	1.0.5
SQLAlchemy	2.0.35
sqlparse	0.5.1
srsly	2.4.8
stack-data	0.6.2
statsmodels	0.14.4
tabulate	0.9.0
tblib	3.0.0
tenacity	9.0.0
texttable	1.7.0
thinc	8.2.5
threadpoolctl	3.5.0
tifffile	2024.9.20
toolz	1.0.0
topojson	1.9
tornado	6.4.1
tqdm	4.66.5
traitlets	5.14.3
truststore	0.9.2
typer	0.13.1
typing_extensions	4.12.2
tzdata	2024.2
Unidecode	1.3.8
url-normalize	1.4.3
urllib3	1.26.20
wasabi	1.1.3
wcwidth	0.2.13
weasel	0.4.1
webdriver-manager	4.0.2
websocket-client	1.8.0
Werkzeug	3.0.4
wheel	0.44.0
wordcloud	1.9.3
wrapt	1.16.0
xgboost	2.1.1
xlrd	2.0.1
xyzservices	2024.9.0
yarl	1.13.1
yellowbrick	1.5
zict	3.0.0
zipp	3.20.2
zstandard	0.23.0

View file history

md`Ce fichier a été modifié __${table_commit.length}__ fois depuis sa création le ${creation_string} (dernière modification le ${last_modification_string})`

creation = d3.min(
  table_commit.map(d => new Date(d.Date))
)

last_modification = d3.max(
  table_commit.map(d => new Date(d.Date))
)

creation_string = creation.toLocaleString("fr", {
  "day": "numeric",
  "month": "long",
  "year": "numeric"
})

last_modification_string = last_modification.toLocaleString("fr", {
  "day": "numeric",
  "month": "long",
  "year": "numeric"
})

html`<div>${git_history_table}</div>`

html`<div>${git_history_plot}</div>`

SHA	Date	Author	Description
88d19d3	2024-10-11 12:27:17	lgaliana	Add NLTK punkt_tab
c5a9fb7	2024-07-22 09:56:18	Julien PRAMIL	Fix bug in LDA chapter (#525)
06d003a	2024-04-23 10:09:22	Lino Galiana	Continue la restructuration des sous-parties (#492)
005d89b	2023-12-20 17:23:04	Lino Galiana	Finalise l’affichage des statistiques Git (#478)
3fba612	2023-12-17 18:16:42	Lino Galiana	Remove some badges from python (#476)
4cd44f3	2023-12-11 17:37:50	Antoine Palazzolo	Relecture NLP (#474)
889a71b	2023-11-10 11:40:51	Antoine Palazzolo	Modification TP 3 (#443)
a771183	2023-10-09 11:27:45	Antoine Palazzolo	Relecture TD2 par Antoine (#418)
154f09e	2023-09-26 14:59:11	Antoine Palazzolo	Des typos corrigées par Antoine (#411)
3bdf3b0	2023-08-25 11:23:02	Lino Galiana	Simplification de la structure 🤓 (#393)
78ea2cb	2023-07-20 20:27:31	Lino Galiana	Change titles levels (#381)
d2a2773	2023-07-07 15:59:36	Lino Galiana	Retour du wordcloud (#372)
29ff3f5	2023-07-07 14:17:53	linogaliana	description everywhere
f21a24d	2023-07-02 10:58:15	Lino Galiana	Pipeline Quarto & Pages 🚀 (#365)
934149d	2023-02-13 11:45:23	Lino Galiana	get_feature_names is deprecated in scikit 1.0.X versions (#351)
f10815b	2022-08-25 16:00:03	Lino Galiana	Notebooks should now look more beautiful (#260)
d201e3c	2022-08-03 15:50:34	Lino Galiana	Pimp la homepage ✨ (#249)
bb38643	2022-06-08 16:59:40	Lino Galiana	Répare bug leaflet (#234)
299cff3	2022-06-08 13:19:03	Lino Galiana	Problème code JS suite (#233)
12965ba	2022-05-25 15:53:27	Lino Galiana	:launch: Bascule vers quarto (#226)
9c71d6e	2022-03-08 10:34:26	Lino Galiana	Plus d’éléments sur S3 (#218)
e94c1c5	2021-12-23 21:34:46	Lino Galiana	Un tutoriel sur les pipelines :tada: (#203)
09b60a1	2021-12-21 19:58:58	Lino Galiana	Relecture suite du NLP (#205)
495599d	2021-12-19 18:33:05	Lino Galiana	Des éléments supplémentaires dans la partie NLP (#202)
2a8809f	2021-10-27 12:05:34	Lino Galiana	Simplification des hooks pour gagner en flexibilité et clarté (#166)
2e4d586	2021-09-02 12:03:39	Lino Galiana	Simplify badges generation (#130)
4cdb759	2021-05-12 10:37:23	Lino Galiana	:sparkles: :star2: Nouveau thème hugo :snake: :fire: (#105)
7f9f97b	2021-04-30 21:44:04	Lino Galiana	🐳 + 🐍 New workflow (docker 🐳) and new dataset for modelization (2020 🇺🇸 elections) (#99)
f4e61ed	2020-12-10 15:00:12	Lino Galiana	Ajout partie LDA (#88)
d164635	2020-12-08 16:22:00	Lino Galiana	:books: Première partie NLP (#87)

git_history_table = Inputs.table(
  table_commit,
  {
    format: {
      SHA: x => md`[${x}](${github_repo}/commit/${x})`,
      Description: x => md`${replacePullRequestPattern(x, github_repo)}`,
      /*Date: x => x.toLocaleString("fr", {
        "month": "numeric",
        "day": "numeric",
        "year": "numeric"
        })
      */
    }
  }
)

git_history_plot = Plot.plot({
  marks: [
    Plot.ruleY([0], {stroke: "royalblue"}),
    Plot.dot(
          table_commit,
          Plot.pointerX({x: (d) => new Date(d.date), y: 0, stroke: "red"})),
    Plot.dot(table_commit, {x: (d) => new Date(d.Date), y: 0, fill: "royalblue"})
  ]
})

function replacePullRequestPattern(inputString, githubRepo) {
    // Use a regular expression to match the pattern #digit
    var pattern = /#(\d+)/g;

    // Replace the pattern with ${github_repo}/pull/#digit
    var replacedString = inputString.replace(pattern, '[#$1](' + githubRepo + '/pull/$1)');

    return replacedString;
}

github_repo = "https://github.com/linogaliana/python-datascientist"

table_commit = {

// Get the HTML table by its class name
var table = document.querySelector('.commit-table');

// Check if the table exists
if (table) {
    // Initialize an array to store the table data
    var dataArray = [];

    // Extract headers from the first row
    var headers = [];
    for (var i = 0; i < table.rows[0].cells.length; i++) {
        headers.push(table.rows[0].cells[i].textContent.trim());
    }

    // Iterate through the rows, starting from the second row
    for (var i = 1; i < table.rows.length; i++) {
        var row = table.rows[i];
        var rowData = {};

        // Iterate through the cells in the row
        for (var j = 0; j < row.cells.length; j++) {
            // Use headers as keys and cell content as values
            rowData[headers[j]] = row.cells[j].textContent.trim();
        }

        // Push the rowData object to the dataArray
        dataArray.push(rowData);
    }
  }

  return dataArray

}

// Get the element with class 'git-details'
{
  var gitDetails = document.querySelector('.commit-table');

  // Check if the element exists
  if (gitDetails) {
      // Hide the element
      gitDetails.style.display = 'none';
  }
}

Plot = require('@observablehq/plot@0.6.12/dist/plot.umd.min.js')

Retour au sommet

Citation

BibTeX

@book{galiana2023,
  author = {Galiana, Lino},
  title = {Python pour la data science},
  date = {2023},
  url = {https://pythonds.linogaliana.fr/},
  doi = {10.5281/zenodo.8229676},
  langid = {fr}
}

Veuillez citer ce travail comme suit :

Galiana, Lino. 2023. Python pour la data science. https://doi.org/10.5281/zenodo.8229676.