Python pour la data science

Lino Galiana

doi:10.5281/zenodo.8229676

// Source path of the current page: the first ".html" in the URL pathname is replaced by ".qmd".
path = window.location.pathname.replace(".html", ".qmd");
// When the path contains 'en/content', swap that part to 'content/en'; otherwise keep the path as is.
path_modified = (path.includes('en/content')) ? path.replace('en/content', 'content/en') : path
// Badges for the current page, built from the rewritten path.
html`${printBadges({fpath: path_modified})}`

// Correction area; note that it receives `path`, not `path_modified`.
html`<div>${getConditionalHTML(path, true)}</div>`

/**
 * Tell whether a moment falls inside the September–December window of its own year.
 *
 * The window starts on September 1st at 00:00 and ends on December 24th at 00:00
 * (local time), both bounds included. Any later moment on December 24th is
 * therefore already outside the window.
 *
 * @param {Date} [now=new Date()] - Moment to test; defaults to the current date and time.
 * @returns {boolean} true when `now` lies inside the window.
 */
function isBetweenSeptAndDec(now = new Date()) {
    const year = now.getFullYear();
    const start = new Date(year, 8, 1);  // months are zero-indexed: 8 = September
    const end = new Date(year, 11, 24);  // 11 = December

    return now >= start && now <= end;
}

/**
 * Render the "correction" area of a chapter page.
 *
 * @param {string} path - Source path of the chapter, forwarded to printBadges as `fpath`.
 * @param {boolean} print - Pass exactly `false` to render nothing.
 * @returns An empty string when `print` is `false`; otherwise the value built by the
 *   `md` or `html` template tag: a waiting notice while isBetweenSeptAndDec() is true,
 *   or a collapsible block holding the correction badges.
 */
function getConditionalHTML(path, print) {
    if (print === false) return ``
    if (isBetweenSeptAndDec()) {
        // Inside the date window only a notice is shown, pointing to the corrections already published.
        return md`<i>La correction sera visible prochainement sur cette page. En attendant, la liste des corrections déjà accessibles est [ici](/content/annexes/corrections.html)</i>`;
    } else {
        // Outside the window: correction badges, folded inside a <details> element.
        return html`
        <details>
            <summary>
                Pour ouvrir la version corrigée sous forme de <i>notebook</i>
            </summary>
            ${printBadges({ fpath: path, correction: true })}
        </details>
        `;
    }
}

/**
 * Render either the correction badges or a "coming soon" placeholder.
 *
 * @param {Object} options
 * @param {string} options.fpath - Source path forwarded to printBadges.
 * @param {*} options.correction - Truthy to render the correction badges.
 * @returns The value built by the `html` template tag.
 */
function renderCorrection({ fpath, correction }) {
    if (!correction) {
        return html`<i>La correction sera visible prochainement sur cette page.</i>`;
    }

    const correctionBadges = printBadges({ fpath, correction: true });
    return html`${correctionBadges}`;
}

/**
 * Build the badges (GitHub, SSP Cloud VSCode, SSP Cloud Jupyter, Google Colab)
 * pointing to the notebook derived from a chapter source file.
 *
 * The notebook path is obtained from `sourceFile` by dropping leading slashes,
 * turning a trailing `.Rmd`/`.qmd` into `.ipynb` and replacing the first
 * "content" with "notebooks" (or "corrections").
 *
 * @param {Object} [options]
 * @param {string} [options.sourceFile="content/01_toto.Rmd"] - Path of the chapter source file.
 * @param {string|string[]} [options.type=['md', 'html']] - "md" yields Markdown badges, any other
 *   value yields HTML anchors; an array means "use its first entry".
 * @param {?number} [options.split=null] - Appends `<br>` after the Jupyter (4), VSCode (5)
 *   or Colab (7) badge.
 * @param {boolean} [options.onyxiaOnly=false] - Return only the Jupyter and VSCode SSP Cloud
 *   badges, concatenated without separator or wrapper.
 * @param {string} [options.sspCloudService="python"] - Service suffix (`jupyter-<service>`, `vscode-<service>`).
 * @param {boolean} [options.GPU=false] - Target the `-gpu` variant of the services.
 * @param {boolean} [options.correction=false] - Use the `corrections` tree and pass
 *   "correction" as an extra init-script argument.
 * @returns {string} Badges joined by newlines, wrapped in `<p class="badges">` when the
 *   format is exactly "html"; or the two SSP Cloud badges when `onyxiaOnly` is set.
 */
function reminderBadges({
    sourceFile = "content/01_toto.Rmd",
    type = ['md', 'html'],
    split = null,
    onyxiaOnly = false,
    sspCloudService = "python",
    GPU = false,
    correction = false
} = {}) {
    const format = Array.isArray(type) ? type[0] : type;
    const isMarkdown = format === "md";

    // Leading slashes are dropped so that absolute paths (e.g. a URL pathname)
    // do not produce "blob/main//..." links. The extension is matched literally
    // and only at the end of the path.
    const notebook = sourceFile
        .replace(/^\/+/, "")
        .replace(/\.(Rmd|qmd)$/, ".ipynb")
        .replace(/content/, correction ? "corrections" : "notebooks");

    const notebooksRepo = "linogaliana/python-datascientist-notebooks";
    const githubRepoNotebooks = `https://github.com/${notebooksRepo}`;
    const githubBase = notebook === "" ? githubRepoNotebooks : `${githubRepoNotebooks}/blob/main`;
    const notebookRelPath = `/${notebook}`;

    // Last two path segments: <section>/<chapter>.ipynb. A path without any
    // directory gets an empty section instead of failing.
    const segments = notebook.split("/");
    const chapter = segments[segments.length - 1];
    const section = segments.length > 1 ? segments[segments.length - 2] : "";
    const chapterNoExtension = chapter.replace(".ipynb", "");

    // English chapters are recognised by an "en" path segment, not by the
    // substring "en/", which would also match directories such as "open/".
    const isEnglish = sourceFile.split("/").includes("en");
    const sectionLatest = isEnglish ? `en/${section}` : section;

    const onyxiaInitArgs = [sectionLatest, chapterNoExtension];
    if (correction) {
        onyxiaInitArgs.push("correction");
    }
    const initArgs = onyxiaInitArgs.join('%20');
    const gpuSuffix = GPU ? "-gpu" : "";

    // SSP Cloud launcher URL for a given IDE ("jupyter" or "vscode").
    const launcherUrl = (ide) =>
        `https://datalab.sspcloud.fr/launcher/ide/${ide}-${sspCloudService}${gpuSuffix}?autoLaunch=true&onyxia.friendlyName=%C2%AB${chapterNoExtension}%C2%BB&init.personalInit=%C2%ABhttps%3A%2F%2Fraw.githubusercontent.com%2Flinogaliana%2Fpython-datascientist%2Fmaster%2Fsspcloud%2Finit-${ide}.sh%C2%BB&init.personalInitArgs=%C2%AB${initArgs}%C2%BB`;

    // One clickable image badge, in Markdown or HTML depending on the format.
    const badge = (imageUrl, alt, href) => (
        isMarkdown
            ? `[![${alt}](${imageUrl})](${href})`
            : `<a href="${href}" target="_blank" rel="noopener"><img src="${imageUrl}" alt="${alt}"></a>`
    );

    // Optional line break after the badge designated by `split`.
    const withBreak = (link, position) => (split === position ? `${link}<br>` : link);

    const githubLink = `<a href="${githubBase}${notebookRelPath}" class="github"><i class="fab fa-github"></i></a>`;

    const sspcloudJupyterLink = withBreak(
        badge(
            "https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_Jupyter-orange?logo=Jupyter&logoColor=orange",
            "Onyxia",
            launcherUrl("jupyter")
        ),
        4
    );

    const sspcloudVscodeLink = withBreak(
        badge(
            "https://img.shields.io/badge/SSP%20Cloud-Lancer_avec_VSCode-blue?logo=visualstudiocode&logoColor=blue",
            "Onyxia",
            launcherUrl("vscode")
        ),
        5
    );

    if (onyxiaOnly) {
        return `${sspcloudJupyterLink}${sspcloudVscodeLink}`;
    }

    // Both output formats link to Colab over https.
    const colabLink = withBreak(
        badge(
            "https://colab.research.google.com/assets/colab-badge.svg",
            "Open In Colab",
            `https://colab.research.google.com/github/${notebooksRepo}/blob/main${notebookRelPath}`
        ),
        7
    );

    const joined = [githubLink, sspcloudVscodeLink, sspcloudJupyterLink, colabLink].join("\n");

    return format === "html" ? `<p class="badges">${joined}</p>` : joined;
}


/**
 * Convenience wrapper around reminderBadges with page-oriented defaults
 * (HTML output, line break after the VSCode badge).
 *
 * @param {Object} [options]
 * @param {string} options.fpath - Source path, passed on as `sourceFile`.
 * @param {boolean} [options.onyxiaOnly=false]
 * @param {number} [options.split=5]
 * @param {string} [options.type="html"]
 * @param {string} [options.sspCloudService="python"]
 * @param {boolean} [options.GPU=false]
 * @param {boolean} [options.correction=false]
 * @returns Whatever reminderBadges returns for these options.
 */
function printBadges({
    fpath,
    onyxiaOnly = false,
    split = 5,
    type = "html",
    sspCloudService = "python",
    GPU = false,
    correction = false
} = {}) {
    return reminderBadges({
        sourceFile: fpath,
        type,
        split,
        onyxiaOnly,
        sspCloudService,
        GPU,
        correction
    });
}

Cette page approfondit certains aspects présentés dans les autres tutoriels. Il s’agit d’une suite d’exercices, avec corrections, pour présenter d’autres aspects du NLP ou pratiquer sur des données différentes.

1 NLP & Sentiment Analysis : Analyse textuelle des commentaires Trustpilot

Pour en savoir plus sur le sentiment analysis, vous pouvez regarder le sujet 5 de l’édition 2023 du Funathon, disponible sur Github ou sur le SSP Cloud. Très guidé, ce sujet vous permettra d’avoir les bases sur un projet lié à de l’analyse textuelle.

Là où la première partie consiste en quelques petits rappels sur le web scraping, la seconde consiste à analyser les données collectées. En particulier, étant donné que l’on dispose des commentaires laissés par les clients ainsi que leur note, il est intéressant de se demander dans quelle mesure il est possible de prédire la note laissée par un client selon l’évaluation associée. Pour ce faire, on va devoir coupler les méthodes de traitement du langage naturel (NLP) et celles d’apprentissage statistique (machine learning).

2 Exploration des libellés de l’openfood database

{{% box status=“exercise” title=“Exercise: les noms de produits dans l’openfood database” icon=“fas fa-pencil-alt” %}} L’objectif de cet exercice est d’analyser les termes les plus fréquents dans les noms de produits de l’openfood database. Au passage, cela permet de réviser les étapes de preprocessing (LIEN XXXXX) et d’explorer les enjeux de reconnaissance d’entités nommées. {{% /box %}}

Dans cet exercice:

tokenisation (nltk)
retrait des stop words (nltk)
nuage de mots (wordcloud)
reconnaissance du langage (fasttext)
reconnaissance d’entités nommées (spacy)

le tout sur l’OpenFood Database, une base de données alimentaire qui est enrichie de manière collaborative.

{{% box status=“hint” title=“Hint” icon=“fa fa-lightbulb” %}} Pour pouvoir utiliser les modèles pré-entraînés de spaCy, il faut les télécharger. La méthode préconisée est d’utiliser, depuis un terminal, la commande suivante

python -m spacy download fr_core_news_sm

Dans un notebook jupyter, il se peut qu’il soit nécessaire de relancer le kernel.

Si l’accès à la ligne de commande n’est pas possible, ou si la commande échoue, il est possible de télécharger le modèle pré-entraîné directement depuis une session Python

import spacy

# Download the "fr_core_news_sm" pipeline from within the Python session.
spacy.cli.download("fr_core_news_sm")

Importer le modèle de reconnaissance de langage qui sera utilisé par la suite ainsi que le corpus français utilisé par spacy

import tempfile
import os
import spacy

# Only the *name* of the temporary file is kept. Despite the variable name it is
# a file path, used below as a unique prefix for the downloaded files.
temp_dir = tempfile.NamedTemporaryFile()
temp_dir = temp_dir.name

# Fetch the fastText "lid.176.bin" model with wget into "<prefix>.model.bin".
# The exit status returned by os.system is not checked.
os.system(
    "wget -O {} https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin".format(
        "%s.model.bin" % temp_dir
    )
)
# Download the "fr_core_news_sm" spaCy pipeline.
spacy.cli.download("fr_core_news_sm")

Importer les données de l’openfood database à partir du code suivant

import pandas as pd
import urllib.request


# Download the Open Food Facts products export to "<prefix>.openfood.csv".
urllib.request.urlretrieve(
    "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv",
    "%s.openfood.csv" % temp_dir,
)
# The file is tab-delimited; only the product_name column is read, as strings.
df_openfood = pd.read_csv(
    "%s.openfood.csv" % temp_dir,
    delimiter="\t",
    usecols=["product_name"],
    encoding="utf-8",
    dtype="str",
)

Ces données devraient avoir l’aspect suivant :

# Preview the first two rows and at most the first five columns.
df_openfood.iloc[:2, :5]

Créer une fonction de nettoyage des noms de produits effectuant les étapes suivantes :

tokeniser le texte en question
retirer la ponctuation et les stopwords

Appliquer cette fonction à l’ensemble des noms de produits (variable product_name)

Effectuer un nuage de mots sur les libellés avant et après nettoyage pour comprendre la structure du corpus en question. Le résultat devrait avoir l’apparence suivante

import wordcloud as wc
import matplotlib.pyplot as plt


def graph_wordcloud(data, by=None, valueby=None, yvar="Text"):
    """Build a word cloud from the text column ``yvar`` of ``data``.

    When both ``by`` and ``valueby`` are given, only the rows where
    ``data[by] == valueby`` are used. Values are converted to ``str`` and
    joined with single spaces before being passed to ``WordCloud.generate``.

    Returns the generated ``wordcloud.WordCloud`` object.
    """
    subset = data
    if by is not None and valueby is not None:
        subset = data[data[by] == valueby]
    corpus = " ".join(subset[yvar].astype(str))
    cloud = wc.WordCloud(width=800, height=500, random_state=21, max_words=2000)
    return cloud.generate(corpus)


def graph_wordcloud_by(data, by, yvar="Text"):
    """Draw one word cloud per distinct value of ``data[by]`` on a two-column grid.

    Each subplot is titled with the value it represents. The figure is saved
    once, as ``"<yvar>.png"``, after all subplots have been drawn; nothing is
    saved when there is no value to plot.

    The number of rows is the ceiling of ``len(values) / 2`` so that the last
    value is not dropped when the count is odd (a single value now gets its
    own plot).
    """
    topics = data[by].unique().tolist()
    cols = 2
    rows = -(-len(topics) // cols)  # ceiling division
    fig = plt.figure(figsize=(20, 80))
    for position, topic in enumerate(topics, start=1):
        cloud = graph_wordcloud(data, by=by, valueby=topic, yvar=yvar)
        ax = fig.add_subplot(rows, cols, position)
        ax.set_title("{}".format(topic))
        ax.imshow(cloud)
        ax.axis("off")
    if topics:
        # Saving once at the end yields the same file as saving at every iteration.
        fig.savefig("{}.png".format(yvar), bbox_inches="tight")


def wordcount_words(data, yvar, by=None):
    """Plot word clouds for column ``yvar`` of ``data``.

    Without ``by``, a single word cloud is drawn and saved as ``"<yvar>.png"``.
    With ``by``, the work is delegated to ``graph_wordcloud_by``. A 15x15
    figure is created first in both cases.
    """
    plt.figure(figsize=(15, 15))
    if by is not None:
        graph_wordcloud_by(data, by=by, yvar=yvar)
        return
    cloud = graph_wordcloud(data, yvar=yvar, by=by)
    plt.imshow(cloud)
    plt.axis("off")
    plt.savefig("{}.png".format(yvar), bbox_inches="tight")

# Word cloud on the raw product names.
wordcount_words(df_openfood, yvar="product_name")
# Word cloud on the "tokenized" column.
# NOTE(review): this column is not created in the code shown here; presumably
# it comes from the cleaning function the exercise asks for. Confirm.
wordcount_words(df_openfood, "tokenized")

Utiliser la librairie Fasttext pour extraire les noms de produits français

Appliquer le modèle téléchargé précédemment pour déterminer le langage
Ne récupérer que les libellés français

import fasttext

# Load the model downloaded earlier to "<prefix>.model.bin".
PRETRAINED_MODEL_PATH = "%s.model.bin" % temp_dir
model = fasttext.load_model(PRETRAINED_MODEL_PATH)
newcols = ["language", "score_language"]
# For every product name (cast to str), keep the first element of each of the
# two sequences returned by model.predict, stored as "language" and
# "score_language".
# NOTE(review): the new DataFrame has a default index, so this assignment
# assumes df_openfood also has a default 0..n-1 index. Confirm.
df_openfood[newcols] = pd.DataFrame(
    df_openfood["product_name"]
    .astype(str)
    .apply(lambda s: list(model.predict(s)))
    .apply(lambda l: [l[0][0], l[1][0]])
    .tolist(),
    columns=newcols,
)
# Strip the "__label__" prefix so that only the language code remains.
df_openfood["language"] = df_openfood["language"].str.replace("__label__", "")
# Keep the rows labelled "fr".
df_openfood_french = df_openfood[df_openfood["language"] == "fr"]
df_openfood_french.head(2)

Visualiser avec spacy.displacy le résultat d’une reconnaissance d’entités nommées sur 50 données aléatoires. Cela vous semble-t-il satisfaisant ?

import spacy
import fr_core_news_sm

# Load the French pipeline installed earlier.
nlp = fr_core_news_sm.load()

# 50 randomly sampled product names, joined into one text with " \n " as separator.
example = " \n ".join(df_openfood_french["product_name"].astype("str").sample(50))

from spacy import displacy

# Render the named entities found in the sampled text (style="ent", page=True).
html = displacy.render(nlp(example), style="ent", page=True)

print(html)

Récupérer dans un vecteur les entités nommées reconnues par spaCy. Regarder les entités reconnues dans les 20 premiers libellés de produits

# One list of (entity text, entity label) pairs per product name, for the
# first 20 French product names; the listed pipeline components are disabled.
x = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp.pipe(
        df_openfood_french.head(20)["product_name"].astype("unicode"),
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
    )
]

x

Informations additionnelles

environment files have been tested on.

Python version used:

Package	Version
affine	2.4.0
aiobotocore	2.15.1
aiohappyeyeballs	2.4.3
aiohttp	3.10.8
aioitertools	0.12.0
aiosignal	1.3.1
alembic	1.13.3
altair	5.4.1
aniso8601	9.0.1
annotated-types	0.7.0
appdirs	1.4.4
archspec	0.2.3
asttokens	2.4.1
attrs	24.2.0
babel	2.16.0
bcrypt	4.2.0
beautifulsoup4	4.12.3
black	24.8.0
blinker	1.8.2
blis	0.7.11
bokeh	3.5.2
boltons	24.0.0
boto3	1.35.23
botocore	1.35.23
branca	0.7.2
Brotli	1.1.0
cachetools	5.5.0
cartiflette	0.0.2
Cartopy	0.24.1
catalogue	2.0.10
cattrs	24.1.2
certifi	2024.8.30
cffi	1.17.1
charset-normalizer	3.3.2
click	8.1.7
click-plugins	1.1.1
cligj	0.7.2
cloudpathlib	0.20.0
cloudpickle	3.0.0
colorama	0.4.6
comm	0.2.2
commonmark	0.9.1
conda	24.9.1
conda-libmamba-solver	24.7.0
conda-package-handling	2.3.0
conda_package_streaming	0.10.0
confection	0.1.5
contextily	1.6.2
contourpy	1.3.0
cryptography	43.0.1
cycler	0.12.1
cymem	2.0.8
cytoolz	1.0.0
dask	2024.9.1
dask-expr	1.1.15
databricks-sdk	0.33.0
debugpy	1.8.6
decorator	5.1.1
Deprecated	1.2.14
diskcache	5.6.3
distributed	2024.9.1
distro	1.9.0
docker	7.1.0
duckdb	0.10.1
en-core-web-sm	3.7.1
entrypoints	0.4
et_xmlfile	2.0.0
exceptiongroup	1.2.2
executing	2.1.0
fastexcel	0.11.6
fastjsonschema	2.20.0
fiona	1.10.1
Flask	3.0.3
folium	0.17.0
fontawesomefree	6.6.0
fonttools	4.54.1
frozendict	2.4.4
frozenlist	1.4.1
fsspec	2023.12.2
gensim	4.3.2
geographiclib	2.0
geopandas	1.0.1
geoplot	0.5.1
geopy	2.4.1
gitdb	4.0.11
GitPython	3.1.43
google-auth	2.35.0
graphene	3.3
graphql-core	3.2.4
graphql-relay	3.2.0
graphviz	0.20.3
great-tables	0.12.0
greenlet	3.1.1
gunicorn	22.0.0
h2	4.1.0
hpack	4.0.0
htmltools	0.6.0
hyperframe	6.0.1
idna	3.10
imageio	2.36.0
importlib_metadata	8.5.0
importlib_resources	6.4.5
inflate64	1.0.0
ipykernel	6.29.5
ipython	8.28.0
itsdangerous	2.2.0
jedi	0.19.1
Jinja2	3.1.4
jmespath	1.0.1
joblib	1.4.2
jsonpatch	1.33
jsonpointer	3.0.0
jsonschema	4.23.0
jsonschema-specifications	2024.10.1
jupyter-cache	1.0.0
jupyter_client	8.6.3
jupyter_core	5.7.2
kaleido	0.2.1
kiwisolver	1.4.7
langcodes	3.5.0
language_data	1.3.0
lazy_loader	0.4
libmambapy	1.5.9
locket	1.0.0
lxml	5.3.0
lz4	4.3.3
Mako	1.3.5
mamba	1.5.9
mapclassify	2.8.1
marisa-trie	1.2.1
Markdown	3.6
markdown-it-py	3.0.0
MarkupSafe	2.1.5
matplotlib	3.9.2
matplotlib-inline	0.1.7
mdurl	0.1.2
menuinst	2.1.2
mercantile	1.2.1
mizani	0.11.4
mlflow	2.16.2
mlflow-skinny	2.16.2
msgpack	1.1.0
multidict	6.1.0
multivolumefile	0.2.3
munkres	1.1.4
murmurhash	1.0.10
mypy-extensions	1.0.0
narwhals	1.14.1
nbclient	0.10.0
nbformat	5.10.4
nest_asyncio	1.6.0
networkx	3.3
nltk	3.9.1
numpy	1.26.4
opencv-python-headless	4.10.0.84
openpyxl	3.1.5
opentelemetry-api	1.16.0
opentelemetry-sdk	1.16.0
opentelemetry-semantic-conventions	0.37b0
OWSLib	0.28.1
packaging	24.1
pandas	2.2.3
paramiko	3.5.0
parso	0.8.4
partd	1.4.2
pathspec	0.12.1
patsy	0.5.6
Pebble	5.0.7
pexpect	4.9.0
pickleshare	0.7.5
pillow	10.4.0
pip	24.2
platformdirs	4.3.6
plotly	5.24.1
plotnine	0.13.6
pluggy	1.5.0
polars	1.8.2
preshed	3.0.9
prometheus_client	0.21.0
prometheus_flask_exporter	0.23.1
prompt_toolkit	3.0.48
protobuf	4.25.3
psutil	6.0.0
ptyprocess	0.7.0
pure_eval	0.2.3
py7zr	0.20.8
pyarrow	17.0.0
pyarrow-hotfix	0.6
pyasn1	0.6.1
pyasn1_modules	0.4.1
pybcj	1.0.2
pycosat	0.6.6
pycparser	2.22
pycryptodomex	3.21.0
pydantic	2.9.2
pydantic_core	2.23.4
Pygments	2.18.0
PyNaCl	1.5.0
pynsee	0.1.8
pyogrio	0.10.0
pyOpenSSL	24.2.1
pyparsing	3.1.4
pyppmd	1.1.0
pyproj	3.7.0
pyshp	2.3.1
PySocks	1.7.1
python-dateutil	2.9.0
python-dotenv	1.0.1
python-magic	0.4.27
pytz	2024.1
pyu2f	0.1.5
pywaffle	1.1.1
PyYAML	6.0.2
pyzmq	26.2.0
pyzstd	0.16.2
querystring_parser	1.2.4
rasterio	1.4.2
referencing	0.35.1
regex	2024.9.11
requests	2.32.3
requests-cache	1.2.1
retrying	1.3.4
rich	13.9.4
rpds-py	0.21.0
rsa	4.9
ruamel.yaml	0.18.6
ruamel.yaml.clib	0.2.8
s3fs	2023.12.2
s3transfer	0.10.2
scikit-image	0.24.0
scikit-learn	1.5.2
scipy	1.13.0
seaborn	0.13.2
setuptools	74.1.2
shapely	2.0.6
shellingham	1.5.4
six	1.16.0
smart-open	7.0.5
smmap	5.0.0
sortedcontainers	2.4.0
soupsieve	2.5
spacy	3.7.5
spacy-legacy	3.0.12
spacy-loggers	1.0.5
SQLAlchemy	2.0.35
sqlparse	0.5.1
srsly	2.4.8
stack-data	0.6.2
statsmodels	0.14.4
tabulate	0.9.0
tblib	3.0.0
tenacity	9.0.0
texttable	1.7.0
thinc	8.2.5
threadpoolctl	3.5.0
tifffile	2024.9.20
toolz	1.0.0
topojson	1.9
tornado	6.4.1
tqdm	4.66.5
traitlets	5.14.3
truststore	0.9.2
typer	0.13.1
typing_extensions	4.12.2
tzdata	2024.2
Unidecode	1.3.8
url-normalize	1.4.3
urllib3	1.26.20
wasabi	1.1.3
wcwidth	0.2.13
weasel	0.4.1
webdriver-manager	4.0.2
websocket-client	1.8.0
Werkzeug	3.0.4
wheel	0.44.0
wordcloud	1.9.3
wrapt	1.16.0
xgboost	2.1.1
xlrd	2.0.1
xyzservices	2024.9.0
yarl	1.13.1
yellowbrick	1.5
zict	3.0.0
zipp	3.20.2
zstandard	0.23.0

View file history

// Summary sentence: number of commits, creation date and last modification date.
md`Ce fichier a été modifié __${table_commit.length}__ fois depuis sa création le ${creation_string} (dernière modification le ${last_modification_string})`

// Earliest commit date found in the commit table.
creation = d3.min(table_commit, (commit) => new Date(commit.Date))

// Latest commit date found in the commit table.
last_modification = d3.max(table_commit, (commit) => new Date(commit.Date))

// Both dates are formatted with the "fr" locale as day, full month name and year.
creation_string = creation.toLocaleString("fr", {day: "numeric", month: "long", year: "numeric"})

last_modification_string = last_modification.toLocaleString("fr", {day: "numeric", month: "long", year: "numeric"})

html`<div>${git_history_table}</div>`

html`<div>${git_history_plot}</div>`

SHA	Date	Author	Description
c9f9f8a	2024-04-24 15:09:35	Lino Galiana	Dark mode and CSS improvements (#494)
005d89b	2023-12-20 17:23:04	Lino Galiana	Finalise l’affichage des statistiques Git (#478)
3fba612	2023-12-17 18:16:42	Lino Galiana	Remove some badges from python (#476)
4cd44f3	2023-12-11 17:37:50	Antoine Palazzolo	Relecture NLP (#474)
889a71b	2023-11-10 11:40:51	Antoine Palazzolo	Modification TP 3 (#443)
a771183	2023-10-09 11:27:45	Antoine Palazzolo	Relecture TD2 par Antoine (#418)
154f09e	2023-09-26 14:59:11	Antoine Palazzolo	Des typos corrigées par Antoine (#411)
3bdf3b0	2023-08-25 11:23:02	Lino Galiana	Simplification de la structure 🤓 (#393)
29ff3f5	2023-07-07 14:17:53	linogaliana	description everywhere
f21a24d	2023-07-02 10:58:15	Lino Galiana	Pipeline Quarto & Pages 🚀 (#365)
f10815b	2022-08-25 16:00:03	Lino Galiana	Notebooks should now look more beautiful (#260)
12965ba	2022-05-25 15:53:27	Lino Galiana	:launch: Bascule vers quarto (#226)
9c71d6e	2022-03-08 10:34:26	Lino Galiana	Plus d’éléments sur S3 (#218)
2a8809f	2021-10-27 12:05:34	Lino Galiana	Simplification des hooks pour gagner en flexibilité et clarté (#166)
2e4d586	2021-09-02 12:03:39	Lino Galiana	Simplify badges generation (#130)
80877d2	2021-06-28 11:34:24	Lino Galiana	Ajout d’un exercice de NLP à partir openfood database (#98)

// Interactive table of the commits, with links on the SHA and Description columns.
git_history_table = Inputs.table(table_commit, {
  format: {
    // Link each commit hash to its commit page in the repository.
    SHA: (sha) => md`[${sha}](${github_repo}/commit/${sha})`,
    // Turn "#<number>" references into pull-request links.
    Description: (text) => md`${replacePullRequestPattern(text, github_repo)}`
  }
})

// Timeline of the commits: one blue dot per commit on a horizontal rule, plus a
// red circle on the commit closest to the pointer.
git_history_plot = Plot.plot({
  marks: [
    Plot.ruleY([0], {stroke: "royalblue"}),
    // The rows are keyed by the table headers, so the date column is `Date`;
    // reading `d.date` gave an invalid x position for the pointer mark.
    Plot.dot(
          table_commit,
          Plot.pointerX({x: (d) => new Date(d.Date), y: 0, stroke: "red"})),
    Plot.dot(table_commit, {x: (d) => new Date(d.Date), y: 0, fill: "royalblue"})
  ]
})

/**
 * Turn every "#<digits>" reference of a string into a Markdown link to the
 * matching pull request.
 *
 * @param {string} inputString - Text that may contain references such as "#494".
 * @param {string} githubRepo - Repository URL used as link prefix.
 * @returns {string} The text with each "#<n>" replaced by "[#<n>](<githubRepo>/pull/<n>)".
 */
function replacePullRequestPattern(inputString, githubRepo) {
    // "$1" stands for the captured digits in the replacement string.
    return inputString.replace(/#(\d+)/g, `[#$1](${githubRepo}/pull/$1)`);
}

// Repository URL used as prefix for the commit and pull-request links.
github_repo = "https://github.com/linogaliana/python-datascientist"

table_commit = {

// Get the HTML table by its class name
var table = document.querySelector('.commit-table');

// Check if the table exists
if (table) {
    // Initialize an array to store the table data
    var dataArray = [];

    // Extract headers from the first row
    var headers = [];
    for (var i = 0; i < table.rows[0].cells.length; i++) {
        headers.push(table.rows[0].cells[i].textContent.trim());
    }

    // Iterate through the rows, starting from the second row
    for (var i = 1; i < table.rows.length; i++) {
        var row = table.rows[i];
        var rowData = {};

        // Iterate through the cells in the row
        for (var j = 0; j < row.cells.length; j++) {
            // Use headers as keys and cell content as values
            rowData[headers[j]] = row.cells[j].textContent.trim();
        }

        // Push the rowData object to the dataArray
        dataArray.push(rowData);
    }
  }

  return dataArray

}

// Hide the element with class 'commit-table' when it is present in the page.
{
  const commitTable = document.querySelector('.commit-table');

  if (commitTable) {
    commitTable.style.display = 'none';
  }
}

Plot = require('@observablehq/plot@0.6.12/dist/plot.umd.min.js')

Retour au sommet

Citation

BibTeX

@book{galiana2023,
  author = {Galiana, Lino},
  title = {Python pour la data science},
  date = {2023},
  url = {https://pythonds.linogaliana.fr/},
  doi = {10.5281/zenodo.8229676},
  langid = {fr}
}

Veuillez citer ce travail comme suit :

Galiana, Lino. 2023. Python pour la data science. https://doi.org/10.5281/zenodo.8229676.