From 7b586a70b181e6970ac5a874396514271b1c36b5 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Thu, 17 Oct 2024 17:42:48 +0200 Subject: [PATCH] docs: Add introduction page (#431) New documentation providing an overview of the repository, main practices, and links to key sections of the documentation. --------- Co-authored-by: Pierre Slamich Co-authored-by: Github copilot --- .github/labeler.yml | 22 ++-- .github/workflows/readme-writer.yml | 8 +- README.md | 84 +-------------- docs/index.md | 141 +++++++++++++++++++++++++ docs/linux-server.md | 7 ++ docs/virtual-machines.md | 31 ++++++ scripts/build_mkdocs.sh | 4 - scripts/readme-writer/readme_writer.py | 4 +- 8 files changed, 200 insertions(+), 101 deletions(-) create mode 100644 docs/index.md create mode 100644 docs/virtual-machines.md diff --git a/.github/labeler.yml b/.github/labeler.yml index 716e582d..cea99cb8 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -5,7 +5,7 @@ GitHub Actions: - changed-files: - any-glob-to-any-file: '.github/**/*' -๐Ÿชฆ postmortems: +"๐Ÿชฆ postmortems": - changed-files: - any-glob-to-any-file: 'docs/reports/**/*' @@ -49,20 +49,20 @@ moji: - changed-files: - any-glob-to-any-file: '**/*moji*' -๐Ÿ”’ SSL: +"๐Ÿ”’ SSL": - changed-files: - any-glob-to-any-file: '**/*ssl*' -๐Ÿณ docker: +"๐Ÿณ docker": - changed-files: - any-glob-to-any-file: '**/*docker*' -๐Ÿ“จ email: +"๐Ÿ“จ email": - changed-files: - any-glob-to-any-file: 'docs/mail.md' - any-glob-to-any-file: 'confs/common/systemd/system/email-failures@.service' -๐Ÿ“ˆ Matomo: +"๐Ÿ“ˆ Matomo": - changed-files: - any-glob-to-any-file: 'docs/matomo.md' @@ -82,11 +82,11 @@ proxmox: - changed-files: - any-glob-to-any-file: '**/*proxmox*' -๐Ÿงด Open Beauty Facts: +"๐Ÿงด Open Beauty Facts": - changed-files: - any-glob-to-any-file: '**/*obf*' -๐Ÿชถ Apache: +"๐Ÿชถ Apache": - changed-files: - any-glob-to-any-file: '**/*apache*' @@ -94,14 +94,14 @@ Odoo: - changed-files: - any-glob-to-any-file: '**/*odoo*' -๐Ÿท๏ธ Folksonomy Project: 
+"๐Ÿท๏ธ Folksonomy Project": - changed-files: - any-glob-to-any-file: '**/*folksonomy*' -๐Ÿ“ธ Open Products Facts: +"๐Ÿ“ธ Open Products Facts": - changed-files: - any-glob-to-any-file: '**/*openproductsfacts*' - -observability: + +"๐Ÿ”ญ observability": - changed-files: - any-glob-to-any-file: 'docs/observability.md' diff --git a/.github/workflows/readme-writer.yml b/.github/workflows/readme-writer.yml index 281b6acd..733f4181 100644 --- a/.github/workflows/readme-writer.yml +++ b/.github/workflows/readme-writer.yml @@ -20,12 +20,12 @@ jobs: - run: | pip install -r scripts/readme-writer/requirements.txt python scripts/readme-writer/readme_writer.py - cat README.md - - name: Commit README.md + cat docs/virtual-machines.md + - name: Commit docs/virtual-machines.md.md uses: stefanzweifel/git-auto-commit-action@v5 with: branch: develop repository: . - file_pattern: README.md - commit_message: Update README.md + file_pattern: docs/virtual-machines.md + commit_message: Update docs/virtual-machines.md commit_options: '--no-verify --signoff' diff --git a/README.md b/README.md index f48b3857..2efd210c 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,6 @@ Sysadmin repository for the various parts of the Open Food Facts infrastructure. 
* We have a status page at https://status.openfoodfacts.org/, driven by https://github.com/openfoodfacts/openfoodfacts-upptime * We also have a [specific repository regarding monitoring](https://github.com/openfoodfacts/openfoodfacts-monitoring) -## Current priorities - -As of 2023 our current priorities are: - -* server migration with hardware upgrade and a clean containerized install - and zfs syncs -* better encrypted two-way communication between data centers (stunnel + https) -* backups checks - * testing backups through staging - * with automated deployment of new clones - * monitoring backups -* better monitoring - * more/better dashboards - * more active checks (monitoring - alerts) - * less false positives in alerts -* GPU server for inference and possibly one for training (not hosted) - may - ## Incident logs We started logging incidents by server: @@ -34,38 +18,8 @@ We started logging incidents by server: ## Documentation -Link to [Github Page](https://openfoodfacts.github.io/openfoodfacts-infrastructure/) - -The infrastructure documentation is as follows: - -- [Overview](./docs/overview.md) - -- [Mail](./docs/mail.md) - servers mail setup -- [Free Datacenter](./docs/free-datacenter.md) - Data center with main production servers -- [Linux Server](./docs/linux-server.md) - servers general setup -- [Mail](./docs/mail.md) - servers mail setup -- [An introduction to ZFS](./docs/zfs-overview.md) - ZFS is much used in our infrastructure -- [Proxmox](./docs/proxmox.md) - about proxmox management -- [CICD](./docs/cicd.md) - continuous integration and deployment -- [Observability](./docs/observability.md) - doc on monitoring / logs / etc. 
-- [Docker Onboarding](./docs/docker_onboarding.md) -- [Docker Infrastructure](./docs/docker_architecture.md) -- [Virtual Machines](#virtual-machines) - -The main services: -- [MongoDB](./docs/mongodb.md) the MongoDB database -- [Redis](./docs/redis.md) we also use Redis -- [Open Food Facts Query](./docs/openfoodfacts-query.md) service computing aggregations - -Some services: - -- [Discourse](./docs/discourse.md) for forum -- [NGINX reverse proxy](./docs/nginx-reverse-proxy.md) the reverse proxy for OVH services -- [Folksonomy](./docs/folksonomy.md) user editable labels and values -- [Matomo](./docs/matomo.md) for web analytics -- [Producers sftp](./docs/producers_sftp.md) to push product updates on producer platform -- [Zammad](./docs/zammad.md) for support -- [Odoo](./docs/odoo.md) the CRM +See our [Introduction](./docs/index.md), +also on [Github Page](https://openfoodfacts.github.io/openfoodfacts-infrastructure/) Also look at all install and post-mortem reports in [docs/reports](./docs/reports/) @@ -80,36 +34,6 @@ Also look at all install and post-mortem reports in [docs/reports](./docs/report * The meeting will handle Agenda items first, and if time permits, collaborative bug triage. -## Requests - -### Virtual Machines - - - -| Title |State | OS | CPU # | RAM | SSD (Local) | HDD (Remote) | Services | -|-------------------------------------------------------------------------------------------------------------------------------------------------|------|------------------------------|-----------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------| -|Wordpress test CT [#200] |open |Debian last Stable. | 3|[Explain if > 4 Gb.] |6 GB. | 0|Apache, PHP, Wordpress. 
| -|Monitoring - VM (QEMU host for docker) [#159] |open |Debian |* 4 CPUs |* 12G for we have influxdb and elastic-search that needs memory|* 30 Go disk (it is currently around 14G, but this will grow because we want to harvest more logs and more metrics)|* 50Go for ES backups|Docker, docker-compose | -|CT for new blog engine [#80] |open |Debian stable. |3 CPU. |2 GB. |10 GB |-- |LAMP + wordpress. | -|CT for Folksonomy Engine API dev [#76] |open |Default to Debian last Stable.|2 |1 GB |12 GB. |- |PostgreSQL, Python3. | -| Wild School Eco-Score project [#37] |open |Debian 10 |4 |16 Gb |30 Gb |0 |MongoDB | -| slack-org [#36] |open |Debian 10 |1 |1 Gb |10 Gb |None |Node.js | -| adminer-org [#29] |open |Debian 10 |2 |512 Mb. |4 Gb or even less. |0 |Nginx, PHP, Adminer. | -|Containers (x2) to build a replica set for OFF database [#28]|open |Debian 10 |4 |32 GB |50 GB (DB = 20 GB). |0 |Mongodb. | -| feedme-org [#27] |open |Debian 10 |3 |3 Gb. |15 Gb. |0 |PostgreSQL, Node.js, Nginx. | -| off-wiki-org [#21] |open |Debian 10 |2 |3 Gb |14 Gb. |14 Gb |Apache, PHP, MySQL, Mediawiki. | -|VM for the Community Portal [#124] |closed|Debian last Stable. |[Explain if > 4.]|[Explain if > 4 Gb.] |[Explain if > 32 Gb.] |[Explain if > 1 Tb.] |Python/Django, probably PostgreSQL, probably Apache and all Dockerized | -|VM for the Taxonomy Editor [#123] |closed|Debian last Stable. |[Explain if > 4.]|[Explain if > 4 Gb.] |[Explain if > 32 Gb.] |[Explain if > 1 Tb.] |Python, probably PostgreSQL, probably Apache for lightweight API serving from Docker | -|New VM QEMU for prod docker containers [#71] |closed|Debian 11 (stable) |8 |24 GB |256 GB. 
|- |Services deployed in production: | -| monitoring [#59] |closed|Debian 11 |4 |32GB |64GB |500GB (ovh3 mount) |Docker: ElasticSearch (Kibana?, Logstash?), Grafana, InfluxDB, Prometheus, Alertmanager| -| impactestimator-net [#55] |closed|Debian 11 |1 |1GB |1Gb |0 |https://github.com/openfoodfacts/impactestimator | -| robotoff-ml [#53] |closed|Debian 11 |8 |96GB (Tensorflow, ANN) |192GB [ML models] |100GB |Tensorflow + ElasticSearch | -| robotoff-net [#51] |closed|Debian 11 |4 |16GB (DB 4GB, Services 8GB) |92GB |0GB |Robotoff API + Schedulers + Workers, PostgreSQL DB | -| mongo-dev [#45] |closed|Debian 10 |2 |16GB |40GB | |MongoDB running in Docker | -| off-net [#41] |closed|Debian 10 |4 |16GB (PO needs > 6GB) |192GB |0GB |ProductOpener frontend + backend, MongoDB, PostgreSQL, Memcached | -| robotoff-dev [#40] |closed|Debian 10 |4 |8 Gb |32 Gb |100 Gb |robotoff, elastic search, tensorflow, postgresql | -| Matomo [#24] |closed|Debian 10 |No idea. |No idea. |No idea. |No idea. |LAMP | -| robotoff-org [#20] |closed|Debian 10 |4 |8 Gb |32 Gb |100 Gb |robotoff, elastic search, tensorflow, postgresql | - +## Virtual machines -  Request a VM +See [docs/virtual-machines.md](./docs/virtual-machines.md) \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..79d0da1e --- /dev/null +++ b/docs/index.md @@ -0,0 +1,141 @@ +# Introduction to Open Food Facts Infrastructure + +Welcome to the Open Food Facts Infrastructure documentation! +This repository is dedicated to managing the infrastructure that powers Open Food Facts and its related projects. Our goal is to provide a reliable, scalable, and secure infrastructure to support the various services and applications that make up the Open Food Facts ecosystem. + + +## Main Practices + +### Proxmox + +Proxmox is an open-source server virtualization management solution that we use extensively in our infrastructure. 
It allows us to manage virtual machines (VMs) and containers (CTs) efficiently. Proxmox provides a web-based interface for easy management and monitoring of our virtualized environment. + +Some software is installed / deployed in containers. +Docker deployments normally use a VM. + +For more details about our Proxmox setup and management, see [Proxmox](./proxmox.md). + +### Server Configuration Management + +We manage server configurations using Git. Each server has a clone of this repository, and configuration files are symlinked to the appropriate locations. This allows us to track changes, maintain consistency, and easily roll back to previous configurations if needed. For more details, see [Explanation on server configuration with git](./explain-server-config-in-git.md). + +### Continuous Integration and Continuous Delivery (CICD) + +We make heavy use of CICD processes to automate the integration of code changes. This ensures all tests pass and desired quality standards are met. Our CICD process includes automated testing, building Docker containers, and, for some software, deploying to pre-production and production environments. For more information, see [CICD](./cicd.md). + +### Docker + +Docker is one of the key components of our infrastructure. We use Docker to containerize our applications, ensuring consistency and ease of deployment. Docker Compose is used for orchestration, allowing us to manage multi-container applications with ease. For more details, + +see: +- [Docker at Open Food Facts](./docker.md). +- [Docker Onboarding](./docker_onboarding.md) +- [Docker Infrastructure](./docker_architecture.md) + +### Observability + + +Observability (monitoring, alerts) allows us to monitor the health and performance of our systems, detect issues early, and gain insights into the behavior of our applications. We use a combination of tools and practices to achieve observability, including logging, metrics, and tracing. 
For more details, see [Observability](./observability.md). + +We also have a [status page](https://status.openfoodfacts.org/), driven by [openfoodfacts-upptime](https://github.com/openfoodfacts/openfoodfacts-upptime) +and a [specific repository regarding monitoring](https://github.com/openfoodfacts/openfoodfacts-monitoring). + + +### ZFS + +We make heavy use of ZFS capabilities to store data on disk, and synchronize it across servers thanks to Sanoid. + +See: +- [ZFS Overview](./zfs-overview.md): An introduction to ZFS. +- [Sanoid](./sanoid.md): Information about using Sanoid for ZFS snapshots. + + +## Our Servers + +Our infrastructure is hosted on multiple bare metal servers. +They are grouped in different data centers, usually forming a Proxmox cluster. + +See [Infrastructure Overview](./overview.md) + +Some servers are graciously sponsored by [Fondation Free](https://www.fondation-free.fr/) (at [Scaleway](https://www.scaleway.com/)), [OVH](https://www.ovhcloud.com) and [Moji](https://moji.fr/) + +For more details about our servers and their configurations, see the following pages: + +- [Free Datacenter](./free-datacenter.md) + +- [Moji Datacenter](./moji-datacenter.md) + +### Virtual machines + +See [Virtual machines](./virtual-machines.md) + +## Production Architecture Overview + +Our production architecture consists of different services to run Open Food Facts and sibling projects. +Those are deployed on different servers and different containers and virtual machines. + +For a detailed overview of our production architecture, see [Production Architecture](./prod-architecture.md). + +Other tools supporting the community are deployed in containers, sometimes on the same servers. + +## Repository Structure + +The repository is organized into several directories, each serving a specific purpose: + +- `confs/`: Contains configuration files for various servers and services. +- `docker/`: Contains Docker-related files, including Docker Compose configurations. 
+- `docs/`: Contains documentation files, including this introduction. +- `docs/reports`: Contains post-mortem reports and installation logs. +- `scripts/`: Contains scripts for managing and maintaining the infrastructure. + + +## Services + + +### Important Services + +- [Mail](./mail.md): Details about our mail setup. +- [NGINX reverse proxy](./nginx-reverse-proxy.md): The reverse proxy for all services. + + +### Services Supporting the Main Open Food Facts Deployment + +- [Product Opener](./product-opener.md): Backend that powers the Open Food Facts website and mobile apps. +- [Open Food Facts Query](./openfoodfacts-query.md): Service computing aggregations. +- [Postgres](./postgres.md): Information about our PostgreSQL setup and management. +- [MongoDB](./mongodb.md): Information about our MongoDB setup and management. +- [Redis](./redis.md): Details about our Redis setup and management. +- [Producers SFTP](./producers_sftp.md): Used to push product updates to the producer platform. +- [Folksonomy](./folksonomy.md): User editable labels and values. + +### Tools for the Community + +- [Discourse](./discourse.md): For forum. +- [Matomo](./matomo.md): For web analytics. +- [Zammad](./zammad.md): For support. +- [Odoo](./odoo.md): The CRM. + +## Additional Resources + +Here are some additional resources that may be of interest: + +- [Disks](./disks.md): Information about disk management and best practices. +- [How to mitigate crawlers on prod](./how-to-mitigate-crawlers-on-prod.md): Guide on mitigating crawlers on production. +- [How to resync ZFS replication](./how-to-resync-zfs-replication.md): Guide on resyncing ZFS replication. +- [Linux Server](./linux-server.md): General setup for Linux servers. +- [Rclone](./rclone.md): Information about using rclone. + +### Incident logs + +- [Logs off1](./logs-off1.md): Incident logs for off1 server. +- [Logs off2](./logs-off2.md): Incident logs for off2 server. +- [Logs off3](./logs-off3.md): Incident logs for off3 server. 
+- [Logs ovh1](./logs-ovh1.md): Incident logs for ovh1 server. +- [Logs ovh2](./logs-ovh2.md): Incident logs for ovh2 server. +- [Logs ovh3](./logs-ovh3.md): Incident logs for ovh3 server. + +## You are welcome to contribute + +We hope you find this documentation helpful and welcoming. If you have any questions or need further assistance, please feel free to reach out to us. + +Happy contributing! diff --git a/docs/linux-server.md b/docs/linux-server.md index 375c8bd1..0c96b506 100644 --- a/docs/linux-server.md +++ b/docs/linux-server.md @@ -2,6 +2,8 @@ Here are some guidelines for linux servers. +**FIXME:** this doc is not up-to-date and must be reviewed. + Note that we have some servers (which are bare metal installs. While others are [proxmox hosts](./proxmox.md). On proxmox some VM are lxc containers, while other are QEMU VM. @@ -42,6 +44,11 @@ Remember, that docker as it's own chains that are not affected by `INPUT` and `O So it won't block a port exposed by docker. Use `DOCKER-USER` chain for that. see https://docs.docker.com/network/iptables/ +## Root .bashrc + +Most of the time, the root account is created before the auto-completion package and similar tools are installed. 
+If your shell is not full featured as you are root, you might have to copy `/etc/skel/.bashrc` to `/root` + ## No color in shell Check your TERM variable: `echo $TERM`, it should be `xterm-256color` or `linux` diff --git a/docs/virtual-machines.md b/docs/virtual-machines.md new file mode 100644 index 00000000..88efae8f --- /dev/null +++ b/docs/virtual-machines.md @@ -0,0 +1,31 @@ +# Virtual Machines + + + +| Title |State | OS | CPU # | RAM | SSD (Local) | HDD (Remote) | Services | +|-------------------------------------------------------------------------------------------------------------------------------------------------|------|------------------------------|-----------------|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------| +|Wordpress test CT [#200] |open |Debian last Stable. | 3|[Explain if > 4 Gb.] |6 GB. | 0|Apache, PHP, Wordpress. | +|Monitoring - VM (QEMU host for docker) [#159] |open |Debian |* 4 CPUs |* 12G for we have influxdb and elastic-search that needs memory|* 30 Go disk (it is currently around 14G, but this will grow because we want to harvest more logs and more metrics)|* 50Go for ES backups|Docker, docker-compose | +|CT for new blog engine [#80] |open |Debian stable. |3 CPU. |2 GB. |10 GB |-- |LAMP + wordpress. | +|CT for Folksonomy Engine API dev [#76] |open |Default to Debian last Stable.|2 |1 GB |12 GB. |- |PostgreSQL, Python3. | +| Wild School Eco-Score project [#37] |open |Debian 10 |4 |16 Gb |30 Gb |0 |MongoDB | +| slack-org [#36] |open |Debian 10 |1 |1 Gb |10 Gb |None |Node.js | +| adminer-org [#29] |open |Debian 10 |2 |512 Mb. |4 Gb or even less. |0 |Nginx, PHP, Adminer. | +|Containers (x2) to build a replica set for OFF database [#28]|open |Debian 10 |4 |32 GB |50 GB (DB = 20 GB). |0 |Mongodb. 
| +| feedme-org [#27] |open |Debian 10 |3 |3 Gb. |15 Gb. |0 |PostgreSQL, Node.js, Nginx. | +| off-wiki-org [#21] |open |Debian 10 |2 |3 Gb |14 Gb. |14 Gb |Apache, PHP, MySQL, Mediawiki. | +|VM for the Community Portal [#124] |closed|Debian last Stable. |[Explain if > 4.]|[Explain if > 4 Gb.] |[Explain if > 32 Gb.] |[Explain if > 1 Tb.] |Python/Django, probably PostgreSQL, probably Apache and all Dockerized | +|VM for the Taxonomy Editor [#123] |closed|Debian last Stable. |[Explain if > 4.]|[Explain if > 4 Gb.] |[Explain if > 32 Gb.] |[Explain if > 1 Tb.] |Python, probably PostgreSQL, probably Apache for lightweight API serving from Docker | +|New VM QEMU for prod docker containers [#71] |closed|Debian 11 (stable) |8 |24 GB |256 GB. |- |Services deployed in production: | +| monitoring [#59] |closed|Debian 11 |4 |32GB |64GB |500GB (ovh3 mount) |Docker: ElasticSearch (Kibana?, Logstash?), Grafana, InfluxDB, Prometheus, Alertmanager| +| impactestimator-net [#55] |closed|Debian 11 |1 |1GB |1Gb |0 |https://github.com/openfoodfacts/impactestimator | +| robotoff-ml [#53] |closed|Debian 11 |8 |96GB (Tensorflow, ANN) |192GB [ML models] |100GB |Tensorflow + ElasticSearch | +| robotoff-net [#51] |closed|Debian 11 |4 |16GB (DB 4GB, Services 8GB) |92GB |0GB |Robotoff API + Schedulers + Workers, PostgreSQL DB | +| mongo-dev [#45] |closed|Debian 10 |2 |16GB |40GB | |MongoDB running in Docker | +| off-net [#41] |closed|Debian 10 |4 |16GB (PO needs > 6GB) |192GB |0GB |ProductOpener frontend + backend, MongoDB, PostgreSQL, Memcached | +| robotoff-dev [#40] |closed|Debian 10 |4 |8 Gb |32 Gb |100 Gb |robotoff, elastic search, tensorflow, postgresql | +| Matomo [#24] |closed|Debian 10 |No idea. |No idea. |No idea. |No idea. 
|LAMP | +| robotoff-org [#20] |closed|Debian 10 |4 |8 Gb |32 Gb |100 Gb |robotoff, elastic search, tensorflow, postgresql | + + +  Request a VM diff --git a/scripts/build_mkdocs.sh b/scripts/build_mkdocs.sh index 8c0d5d80..9bb97510 100755 --- a/scripts/build_mkdocs.sh +++ b/scripts/build_mkdocs.sh @@ -22,9 +22,6 @@ EOF # get group id to use it in the docker GID=$(id -g) -# copy README.md as the index but change links starting with ./docs/ to ./ -sed -e 's|(\./docs/|(./|g' README.md > docs/index.md - # we use minidocks capability to add entrypoint to install some pip package # we use also it's capability to change user and group id to avoid permissions problems docker run --rm \ @@ -36,7 +33,6 @@ docker run --rm \ # get exit code ! ERROR=$? # cleanup -rm $PIP_INSTALL docs/index.md if [[ -n $TMP_BUILD_DIR ]]; then rm -rf $TMP_BUILD_DIR; fi exit $ERROR \ No newline at end of file diff --git a/scripts/readme-writer/readme_writer.py b/scripts/readme-writer/readme_writer.py index 24d23bc8..6063ab90 100644 --- a/scripts/readme-writer/readme_writer.py +++ b/scripts/readme-writer/readme_writer.py @@ -75,7 +75,7 @@ def generate_issues_markdown(): def write_readme(issues_table: str): - readme_content = open('README.md', 'r').read() + readme_content = open('docs/virtual-machines.md', 'r').read() # Replace existing issues table with updated one content = re.sub(r'(?<=).+?(?=)', @@ -83,7 +83,7 @@ def write_readme(issues_table: str): readme_content, flags=re.DOTALL) - with open('README.md', 'w') as f: + with open('docs/virtual-machines.md', 'w') as f: f.write(content)