Python pour la data science

Lino Galiana

doi:10.5281/zenodo.8229676

Site web du cours Python pour la data science, une introduction à Python pour la deuxième année du cursus d’ingénieur de l’ENSAE (Master 1).

L’ensemble du contenu de ce groupe est librement disponible ici ou sur Github et peut être testé sous forme de notebooks Jupyter.

Exemple avec l’introduction à Pandas

// NOTE(review): presumably renders the notebook launch badges for the Pandas
// introduction chapter; printBadges is defined outside this view — confirm.
html`${printBadges({fpath: "content/manipulation/02_pandas_intro.qmd"})}`

Au programme:

Globalement, ce cours propose un contenu très complet pouvant autant satisfaire des débutants en data science que des personnes à la recherche de contenu plus avancé :

Manipulation de données : manipulation de données standards (Pandas), données géographiques (Geopandas), récupération de données (webscraping, API)…
Visualisation de données : visualisations classiques (Matplotlib, Seaborn), cartographie, visualisations réactives (Plotly, Folium)
Modélisation : machine learning (Scikit), économétrie
Traitement de données textuelles (NLP) : découverte de la tokenisation avec NLTK et SpaCy, modélisation…
Introduction à la data science moderne : cloud computing, ElasticSearch, intégration continue…

L’ensemble du contenu de ce site s’appuie sur des données ouvertes, qu’il s’agisse de données françaises (principalement issues de la plateforme centralisatrice data.gouv ou du site web de l’Insee) ou de données américaines. Le programme est présenté de manière linéaire dans la partie supérieure de cette page (👆️) ou de manière désordonnée ci-dessous (👇️).

Un bon complément du contenu du site web est le cours que nous donnons avec Romain Avouac en dernière année de l’ENSAE, davantage tourné vers la mise en production de projets de data science : https://ensae-reproductibilite.github.io/

Informations additionnelles

Environnement Python

Ce site a été construit automatiquement par le biais d’une action Github utilisant le logiciel de publication reproductible Quarto (version 1.8.26).

L’environnement utilisé pour obtenir les résultats est reproductible par le biais d’uv. Le fichier pyproject.toml utilisé pour construire cet environnement est disponible sur le dépôt linogaliana/python-datascientist

pyproject.toml

# Project metadata and runtime dependencies of the course environment.
[project]
name = "python-datascientist"
version = "0.1.0"
description = "Source code for Lino Galiana's Python for data science course"
readme = "README.md"
# Only the Python 3.13 series is accepted.
requires-python = ">=3.13,<3.14"
dependencies = [
    "altair>=6.0.0",
    # No version constraint: cartiflette is taken from git (see [tool.uv.sources]).
    "cartiflette",
    "contextily==1.6.2",
    "duckdb>=0.10.1",
    "folium>=0.19.6",
    # Exact pin; wheels come from the dedicated indexes declared below.
    "gdal==3.11.4",
    "graphviz==0.20.3",
    "great-tables>=0.12.0",
    "gt-extras>=0.0.8",
    "ipykernel>=6.29.5",
    "jupyter>=1.1.1",
    "jupyter-cache>=1.0.0",
    "kaleido>=0.2.1",
    "langchain-community>=0.3.27",
    "loguru==0.7.3",
    "markdown>=3.8",
    "nbclient>=0.10.0",
    "nbformat>=5.10.4",
    "nltk>=3.9.1",
    "pandas>=3.0",
    "pip>=25.1.1",
    "plotly>=6.1.2",
    "plotnine>=0.15",
    "polars>=1.8.2",
    "pyarrow>=17.0.0",
    "pynsee>=0.1.8",
    "python-dotenv>=1.0.1",
    "python-frontmatter>=1.1.0",
    "pywaffle>=1.1.1",
    "requests>=2.32.3",
    "scikit-image>=0.24.0",
    "scikit-learn>=1.8.0",
    "scipy>=1.13.0",
    "seaborn>=0.13.2",
    "selenium<4.39.0",
    "spacy>=3.8.4",
    "webdriver-manager>=4.0.2",
    "wordcloud==1.9.3",
]

# Where uv fetches packages that do not come from the default index.
[tool.uv.sources]
cartiflette = { git = "https://github.com/inseefrlab/cartiflette" }
# gdal is resolved from a different index depending on the platform marker.
# NOTE(review): no entry covers other platforms (e.g. macOS) — confirm the
# default index is the intended fallback there.
gdal = [
  { index = "gdal-wheels", marker = "sys_platform == 'linux'" },
  { index = "geospatial_wheels", marker = "sys_platform == 'win32'" },
]

# Index referenced by the win32 gdal source above.
[[tool.uv.index]]
name = "geospatial_wheels"
url = "https://nathanjmcdougall.github.io/geospatial-wheels-index/"
explicit = true

# Index referenced by the linux gdal source above.
[[tool.uv.index]]
name = "gdal-wheels"
url = "https://gitlab.com/api/v4/projects/61637378/packages/pypi/simple"
explicit = true

# Development-only tooling, kept out of the runtime dependencies.
[dependency-groups]
dev = [
    "nb-clean>=4.0.1",
]

Pour utiliser exactement le même environnement (version de Python et packages), se reporter à la documentation d’uv.

Historique du fichier

// Summary sentence: number of commits, creation date and last modification date.
md`Ce fichier a été modifié __${table_commit.length}__ fois depuis sa création le ${creation_string} (dernière modification le ${last_modification_string})`

// Commit table built by the git_history_table cell.
html`<div>${git_history_table}</div>`

// Commit timeline built by the git_history_plot cell.
html`<div>${git_history_plot}</div>`

SHA	Date	Author	Description
2b4ce96e	2026-07-16 18:13:19	linogaliana	fix pb with numpy chapter
e0fa908a	2024-10-12 13:50:16	lgaliana	Mise en forme exogit

// Earliest commit timestamp found in the "Date" column of table_commit.
creation = d3.min(table_commit, (commit) => new Date(commit.Date))

// Most recent commit timestamp found in the "Date" column of table_commit.
last_modification = d3.max(table_commit, (commit) => new Date(commit.Date))

// Creation date as French long-form text (numeric day, full month name, year).
creation_string = creation.toLocaleString("fr", {
  day: "numeric",
  month: "long",
  year: "numeric"
})

// Last modification date, formatted the same way as creation_string.
last_modification_string = last_modification.toLocaleString("fr", {
  day: "numeric",
  month: "long",
  year: "numeric"
})

// Commit history as a table: one row per entry of table_commit.
git_history_table = Inputs.table(table_commit, {
  format: {
    // Each commit hash links to its commit page under github_repo.
    SHA: (sha) => md`[${sha}](${github_repo}/commit/${sha})`,
    // "#123" references in the commit message become pull-request links.
    Description: (message) => md`${replacePullRequestPattern(message, github_repo)}`
    // The Date column has no custom formatter.
  }
})

// Timeline of commits: a baseline, one blue dot per commit, and a red ring on
// the commit closest to the pointer along the x axis.
git_history_plot = Plot.plot({
  marks: [
    // Horizontal baseline at y = 0 on which all dots sit.
    Plot.ruleY([0], {stroke: "royalblue"}),
    // Hover highlight. The column is named "Date" (capital D), as in the other
    // marks and cells; reading `d.date` yields undefined and an invalid date.
    Plot.dot(
      table_commit,
      Plot.pointerX({x: (d) => new Date(d.Date), y: 0, stroke: "red"})
    ),
    // One filled dot per commit, positioned by commit date.
    Plot.dot(table_commit, {x: (d) => new Date(d.Date), y: 0, fill: "royalblue"})
  ]
})

/**
 * Turn every "#<digits>" reference in a string into a Markdown link to the
 * matching pull request of a GitHub repository.
 *
 * @param {string} inputString - Text that may contain "#123"-style references.
 * @param {string} githubRepo - Repository base URL, without trailing slash.
 * @returns {string} The text with each reference rewritten as
 *   "[#123](<githubRepo>/pull/123)"; unchanged when nothing matches.
 */
function replacePullRequestPattern(inputString, githubRepo) {
    // "$1" is the captured run of digits; it is substituted by String.replace.
    const markdownLink = `[#$1](${githubRepo}/pull/$1)`;

    return inputString.replace(/#(\d+)/g, markdownLink);
}

// Base URL of the GitHub repository, used to build commit and pull-request links.
github_repo = "https://github.com/linogaliana/python-datascientist"

table_commit = {

// Get the HTML table by its class name
var table = document.querySelector('.commit-table');

// Check if the table exists
if (table) {
    // Initialize an array to store the table data
    var dataArray = [];

    // Extract headers from the first row
    var headers = [];
    for (var i = 0; i < table.rows[0].cells.length; i++) {
        headers.push(table.rows[0].cells[i].textContent.trim());
    }

    // Iterate through the rows, starting from the second row
    for (var i = 1; i < table.rows.length; i++) {
        var row = table.rows[i];
        var rowData = {};

        // Iterate through the cells in the row
        for (var j = 0; j < row.cells.length; j++) {
            // Use headers as keys and cell content as values
            rowData[headers[j]] = row.cells[j].textContent.trim();
        }

        // Push the rowData object to the dataArray
        dataArray.push(rowData);
    }
  }

  return dataArray

}

// Hide the element with class 'commit-table' from the rendered page.
{
  const commitTable = document.querySelector('.commit-table');

  // Nothing to hide when the page has no such element.
  if (commitTable) {
    commitTable.style.display = 'none';
  }
}

Plot = require('@observablehq/plot@0.6.12/dist/plot.umd.min.js')

Retour au sommet

Citation

BibTeX

@book{galiana2025,
  author = {Galiana, Lino},
  title = {Python pour la data science},
  date = {2025},
  url = {https://pythonds.linogaliana.fr/},
  doi = {10.5281/zenodo.8229676},
  langid = {fr}
}

Veuillez citer ce travail comme suit :

Galiana, Lino. 2025. Python pour la data science. https://doi.org/10.5281/zenodo.8229676.