Add support for project-wide occurrences #123
Workflow file for this run
name: Fuzzy CI
on:
  pull_request:
    branches: [master]
    types: [opened, synchronize, reopened, unlabeled, labeled]
    paths-ignore:
      - "**.md"
      - "**.txt"
      - ".git*"
      - "doc/**"
      - "emacs/**"
      - "vim/**"
      - "**/emacs-lint.yml"
      - "bench/**"
      - "upstream/**"
      - "tests/**"
env:
  # Artifact names need to be consistent across jobs:
  BASE_BRANCH_ARTIFACT_NAME: base-branch-data-${{ github.event.pull_request.base.sha }}-pr${{ github.event.pull_request.number }}
  MERGE_BRANCH_ARTIFACT_NAME: merge-branch-data-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }}-pr${{ github.event.pull_request.number }}
  DIFF_ARTIFACT_NAME: diff-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }}
  # File names also need to be consistent across jobs:
  FULL_DIFF_FILE: full_responses.diff
  DISTILLED_DIFF_FILE: distilled_data.diff
  # Note: FULL_DATA_FILE and DISTILLED_DATA_FILE need to match the file names generated by `merl-an behavior`
  FULL_DATA_FILE: full_responses.json
  DISTILLED_DATA_FILE: distilled_data.json
  # GitHub API related short-hands:
  GH_API_COMMENTS: ${{ github.event.pull_request.comments_url }}
  GH_API_LABELS: ${{ github.event.pull_request.issue_url }}/labels
  GH_API_ARTIFACTS: ${{ github.event.pull_request.base.repo.url }}/actions/artifacts
  # URL short-hands
  ACTIONS_RUNS_ENDPOINT: ${{ github.event.repository.html_url }}/actions/runs
  CURRENT_ACTION_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}
  # Irmin version and merl-an version need to be consistent for reproducibility (Irmin is used as the code base on which `ocamlmerlin` is tested)
  IRMIN_VERSION: 3.9.0
  # TODO: Release merl-an and install a certain version instead of pinning it to a certain commit
  MERL_AN_SHA: 6411f0d3847e8b7e66362bcb1f9345a5d3e851ca
  # The compiler version used on the respective branches. It also needs to form part of Irmin's build cache key.
  # Bump either of these whenever the compiler version is bumped on either of the two branches.
  merge_branch_COMPILER_VERSION: ocaml-base-compiler.4.14.1
  base_branch_COMPILER_VERSION: ocaml-base-compiler.4.14.1
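# Rough job overview (derived from the definitions below):
#   data    - builds merlin on the base and merge commits and runs `merl-an` against Irmin to record responses
#   diff    - diffs the two data sets and uploads the result as an artifact
#   output  - fails the run while there is an unapproved diff; approval is given via the 'fuzzy-diff-looks-good' label
#   approve - runs when the label is added and forwards a PR comment recording the approved diff's hash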
jobs:
  data:
    name: Generate data
    runs-on: ubuntu-22.04
    if: >
      github.event.action == 'opened' ||
      github.event.action == 'synchronize' ||
      github.event.action == 'reopened' ||
      (
        github.event.action == 'unlabeled' &&
        github.event.label.name == 'fuzzy-diff-looks-good'
      )
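    # The data set is (re-)generated whenever the PR code changes and when the approval
    # label is removed; the 'labeled' event is handled by the `approve` job instead.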
    env:
      data_dir: data
    strategy:
      matrix:
        commit: ["merge_branch", "base_branch"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Checking out ${{ matrix.commit }}
        env:
          base_branch_sha: ${{ github.event.pull_request.base.sha }}
          merge_branch_sha: ${{ github.sha }}
        run: |
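          # `$${{ matrix.commit }}_sha` goes through two expansion passes: the workflow
          # expression is substituted first, so for the "merge_branch" matrix entry this line
          # becomes `sha=$merge_branch_sha`, which the shell then resolves against the step's
          # `env` block above. The same trick is used for the compiler version below.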
          sha=$${{ matrix.commit }}_sha
          echo "Check out $sha"
          git checkout $sha
      - name: Get desired compiler version
        id: compiler
        run: |
          v=$${{ matrix.commit }}_COMPILER_VERSION
          echo "version=$v" | tee -a $GITHUB_OUTPUT
      - name: Install OCaml
        uses: ocaml/setup-ocaml@v2
        with:
          ocaml-compiler: ${{ steps.compiler.outputs.version }}
          dune-cache: true
      - name: Install merlin dependencies
        run: |
          opam pin menhirLib 20201216 --no-action
          opam install . --deps-only --yes
      - name: Install merlin
        run: |
          # Running `subst` to have the current commit in the data produced by `merl-an`
          opam exec -- dune subst
          opam exec -- dune build -p merlin-lib,dot-merlin-reader,merlin
          opam exec -- dune install -p merlin-lib,dot-merlin-reader,merlin
      - name: Pull irmin and its deps from cache if possible
        uses: actions/cache@v3
        id: irmin-cache
        with:
          path: irmin/
          key: os${{ runner.os }}+arch${{ runner.arch }}+${{ hashFiles('.github/fuzzy-ci-helpers/irmin.3.9.0.opam.locked') }}+${{ env.IRMIN_VERSION }}+${{ steps.compiler.outputs.version }}
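      # The cache key covers everything that can invalidate the prebuilt Irmin tree:
      # runner OS and arch, the lock file contents, the Irmin release, and the compiler version.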
      - name: Download Irmin tarball
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          wget https://github.com/mirage/irmin/releases/download/$IRMIN_VERSION/irmin-$IRMIN_VERSION.tbz
      - name: Create irmin dir
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: mkdir -p irmin
      - name: Decompress Irmin tarball
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: tar xvf irmin-$IRMIN_VERSION.tbz -C irmin --strip-components=1
      - name: Get Irmin's lock files
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          cp .github/fuzzy-ci-helpers/irmin.3.9.0.opam.locked irmin/irmin.opam.locked
      - name: Install opam monorepo
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: opam install opam-monorepo --yes
      - name: Pull in Irmin's dependencies
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          git checkout ${{ github.sha }}
          opam monorepo pull --lockfile=irmin.opam.locked --yes
        working-directory: irmin
      - name: Prune Irmin
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          rm -r examples/ bench/
          find test/ -mindepth 1 -maxdepth 1 -type d -not -name 'irmin-pack' -exec rm -r {} \;
          find src/ -mindepth 1 -maxdepth 1 -type d \
            -not -name 'irmin-pack' \
            -not -name 'irmin' \
            -not -name 'irmin-tezos' \
            -not -name ppx_irmin \
            -not -name irmin_test \
            -not -name irmin-test \
            -exec rm -r {} \;
        working-directory: irmin
      - name: Build Irmin
        run: |
          opam exec -- dune build @check
        working-directory: irmin
      - name: Pull merl-an from cache if possible
        uses: actions/cache@v3
        id: merl-an-cache
        with:
          path: /usr/local/bin/merl-an
          key: os${{ runner.os }}+arch${{ runner.arch }}+merl-an-sha${{ env.MERL_AN_SHA }}
      - name: Install merl-an
        if: steps.merl-an-cache.outputs.cache-hit != 'true'
        run: opam pin -y merl-an https://github.com/pitag-ha/merl-an.git#$MERL_AN_SHA
      - name: Add merl-an to /usr/local/bin/
        if: steps.merl-an-cache.outputs.cache-hit != 'true'
        run: opam exec -- cp $GITHUB_WORKSPACE/_opam/bin/merl-an /usr/local/bin/merl-an
      - name: Create data set of Merlin responses
        run: |
          # Note: The parameters with most influence on the execution time are
          # `--sample-size`: Number of samples per file defined by `--project` (and per local query).
          # `--project`: List of dirs/files to create samples on. In the case of a dir, all ml(i) files recursively inside it are used.
          # `--queries`: The `ocamlmerlin` queries that are being run.
          opam exec -- merl-an behavior \
            --queries=type-enclosing,occurrences,locate,complete-prefix,errors \
            --sample-size=30 \
            --data=${{ env.data_dir }} \
            --merlin=ocamlmerlin \
            --project=irmin/src/irmin,irmin/src/irmin-pack,irmin/test/irmin-pack
      - name: Remove varying components from data
        run: |
          # TODO: This could be done on the `merl-an` side
          jq '.responses |= map(del(.heap_mbytes, .timings, .cache))' \
            ${{ env.data_dir }}/$FULL_DATA_FILE > temp.json && \
            mv temp.json ${{ env.data_dir }}/$FULL_DATA_FILE
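      # The fields deleted above (heap_mbytes, timings, cache) are per-run measurements;
      # dropping them keeps the diff between base and merge branch data free of noise.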
      - name: Create name for data artifact
        id: artifact_name
        env:
          base_branch_artifact_name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }}
          merge_branch_artifact_name: ${{ env.MERGE_BRANCH_ARTIFACT_NAME }}
        run: echo "name=$${{ matrix.commit }}_artifact_name" >> $GITHUB_OUTPUT
      - name: Upload data
        uses: actions/upload-artifact@v3
        with:
          name: ${{ steps.artifact_name.outputs.name }}
          path: ${{ env.data_dir }}
      - name: Compile diff tool
        if: ${{ matrix.commit == 'merge_branch' }}
        run: |
          # Taking advantage of the fact that ocamlopt is installed on this runner: compile the diff tool here and share it with the next job where it's needed.
          # All GH runners are hosted on x86 machines and all jobs in this workflow declare the same OS, so this should work™.
          opam exec -- ocamlopt -o create_diff .github/fuzzy-ci-helpers/create_diff.ml
      - name: Upload diff tool
        if: ${{ matrix.commit == 'merge_branch' }}
        uses: actions/upload-artifact@v3
        with:
          name: diff_tool
          path: create_diff
  diff:
    name: Generate diffs
    runs-on: ubuntu-22.04
    outputs:
      diff_exists: ${{ steps.full_responses_diff.outputs.diff_exists }}
    needs: data
    env:
      base_data_dir: base_data
      merge_data_dir: merge_data
      diff_dir: diff
    steps:
      - name: Download base branch data
        uses: actions/download-artifact@v3
        with:
          name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }}
          path: ${{ env.base_data_dir }}
      - name: Download merge branch data
        uses: actions/download-artifact@v3
        with:
          name: ${{ env.MERGE_BRANCH_ARTIFACT_NAME }}
          path: ${{ env.merge_data_dir }}
      - name: Create diff dir
        run: mkdir -p "$diff_dir"
      - name: Download diff tool
        uses: actions/download-artifact@v3
        with:
          name: diff_tool
      - name: Give diff tool execute permissions
        run: chmod +x create_diff
      - name: Generate full responses diff
        id: full_responses_diff
        run: |
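          # The jq program below interleaves the two data sets into a single text stream which
          # create_diff (as used here) is expected to split back apart: for every sample it emits
          #   base branch: <cmd> (id=<n>)  followed by the full base record,
          #   --input-separator--
          #   merge branch: <cmd> (id=<n>) followed by the full merge record,
          #   --diff-cmd-separator--
          # and create_diff writes the diff of each such pair into the file given as its third argument.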
          jq -r -n \
            --slurpfile data1 "$base_data_dir/$FULL_DATA_FILE" \
            --slurpfile data2 "$merge_data_dir/$FULL_DATA_FILE" \
            'def process_json($branch; $data):
               ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data;
             range($data1|length) as $i |
               process_json("base branch"; $data1[$i]),
               "--input-separator--",
               process_json("merge branch"; $data2[$i]),
               "--diff-cmd-separator--"' \
            | ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$FULL_DIFF_FILE"
          if [ -s "$diff_dir/$FULL_DIFF_FILE" ]; then
            echo "diff_exists=true" | tee -a $GITHUB_OUTPUT
          else
            echo "diff_exists=false" | tee -a $GITHUB_OUTPUT
          fi
      - name: Generate distilled data diff
        # If there's no full responses diff, there also won't be a distilled data diff
        if: ${{ steps.full_responses_diff.outputs.diff_exists == 'true' }}
        run: |
          jq -r -n \
            --slurpfile data1 "$base_data_dir/$DISTILLED_DATA_FILE" \
            --slurpfile data2 "$merge_data_dir/$DISTILLED_DATA_FILE" \
            'def process_json($branch; $data):
               ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data;
             range($data1|length) as $i |
               process_json("base branch"; $data1[$i]),
               "--input-separator--",
               process_json("merge branch"; $data2[$i]),
               "--diff-cmd-separator--"' \
            | ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$DISTILLED_DIFF_FILE"
      - name: Upload diff(s)
        uses: actions/upload-artifact@v3
        with:
          name: ${{ env.DIFF_ARTIFACT_NAME }}
          path: ${{ env.diff_dir }}
  output:
    name: Evaluate diffs
    runs-on: ubuntu-22.04
    needs: diff
    env:
      earlier_diff_was_approved: ${{ contains(github.event.pull_request.labels.*.name, 'fuzzy-diff-looks-good') }}
      current_diff_exists: ${{ needs.diff.outputs.diff_exists }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download current diff(s)
        if: ${{ env.current_diff_exists == 'true' }}
        uses: actions/download-artifact@v3
        with:
          name: ${{ env.DIFF_ARTIFACT_NAME }}
      - name: Retrieve hash of approved diff
        if: ${{ env.earlier_diff_was_approved == 'true' }}
        id: approved_diff
        run: |
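          # Walk all PR comments (100 per page, following the `rel="next"` Link header for
          # pagination) and keep the most recent comment by github-actions[bot] that starts
          # with the canned message in msg.txt; the approved diff's hash is the last field
          # of that comment's '256-sha' line.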
          msg_start=$(head -c 50 .github/fuzzy-ci-helpers/msg.txt)
          next_page_endpoint="$GH_API_COMMENTS?per_page=100&page=1"
          latest_comment="{}"
          while [ -n "$next_page_endpoint" ]; do
            latest_comment=$(
              curl -s -D "headers.txt" -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" "$next_page_endpoint" |
                jq --arg msg_start "$msg_start" --argjson latest "$latest_comment" '
                  map(
                    select(
                      (.body | startswith($msg_start)) and .user.login == "github-actions[bot]"
                    )
                  ) + [$latest] | max_by(.created_at)'
            )
            next_page_endpoint=$(
              rg '^link:' headers.txt |
                tr ',' '\n' |
                rg 'rel="next"' |
                cut -d'<' -f2 |
                cut -d'>' -f1
            )
          done
          hash=$(echo "$latest_comment" | jq '.body' -r | grep '256-sha' | awk '{print $NF}')
          echo "hash='$hash'" | tee -a $GITHUB_OUTPUT
      - name: Analyze current diff
        id: current_diff
        run: |
          hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}')
          echo "hash='$hash'" | tee -a $GITHUB_OUTPUT
      - name: Write instruction to delete PR label
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to delete PR labels. Instead, we forward the
        # instruction to delete the label to fuzzy-ci-privileged.yml.
        if: ${{ env.earlier_diff_was_approved == 'true' && steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }}
        run: |
          echo ${{ steps.approved_diff.outputs.hash }}
          echo ${{ steps.current_diff.outputs.hash }}
          mkdir -p ./forward
          jq -n \
            --arg instruction "delete_label" \
            --arg endpoint "$GH_API_LABELS" \
            '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json
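      # The instruction file written above is picked up by fuzzy-ci-privileged.yml; it is a small
      # JSON document of the shape {"instruction": "delete_label", "endpoint": "<labels API URL>"}.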
      - name: Upload instruction to delete label
        if: ${{ env.earlier_diff_was_approved == 'true' && steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }}
        uses: actions/upload-artifact@v3
        with:
          name: forwarded_instructions
          path: forward/
      - name: Return
        id: return
        env:
          github_api_labels_url: ${{ github.event.pull_request.base.repo.url }}/issues/${{ github.event.pull_request.number }}/labels
        run: |
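          # Decision logic:
          #   label present & hash unchanged -> pass (this diff was already approved)
          #   label present & hash changed   -> fail and have the label removed (via the forwarded instruction)
          #   no label      & diff exists    -> fail and ask for the diff to be reviewed and labeled
          #   no label      & no diff        -> pass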
          print_head_of_diffs () {
            echo "--------beginning of full responses diff head--------"
            head -n 100 "$FULL_DIFF_FILE"
            echo "--------end of full responses diff head--------"
            echo "--------beginning of distilled data diff head--------"
            head -n 100 "$DISTILLED_DIFF_FILE"
            echo "--------end of distilled data diff head--------"
          }
          # FIXME (?): Are nested conditionals always so ugly in Bash, or is there a better way? Option types and the possibility to match would help a lot.
          LABEL_NAME=$(cat .github/fuzzy-ci-helpers/label_name.txt)
          if $earlier_diff_was_approved; then
            echo "Earlier diff was approved."
            if [ ${{ steps.current_diff.outputs.hash }} == ${{ steps.approved_diff.outputs.hash }} ]; then
              echo "This diff has been approved earlier. Everything ok."
              exit 0
            else
              print_head_of_diffs
              printf "The diff has changed since it was approved, so I'm removing the $LABEL_NAME label. If the new diff looks good, please set the label again.\n\
          A head of the new diffs is printed above. The whole diffs can be downloaded from $CURRENT_ACTION_URL .\n\
          Previous sha256: ${{ steps.approved_diff.outputs.hash }}\n\
          Current sha256: ${{ steps.current_diff.outputs.hash }}"
              echo "delete_label=true" >> $GITHUB_OUTPUT
              exit 1
            fi
          else
            if $current_diff_exists; then
              print_head_of_diffs
              printf "A head of the diffs is printed above. The diffs can be downloaded from $CURRENT_ACTION_URL .\nIf it looks good, please set the $LABEL_NAME label on the PR."
              exit 1
            else
              echo "No diff. All good."
              exit 0
            fi
          fi
  approve:
    name: Approve diff
    if: >
      github.event_name == 'pull_request' &&
      github.event.action == 'labeled' &&
      github.event.label.name == 'fuzzy-diff-looks-good'
    runs-on: ubuntu-22.04
    steps:
      - name: Retrieve diff artifact meta-data
        id: diff_metadata
        run: |
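          # Look up the diff artifact produced by an earlier run of this workflow through the
          # repository artifacts API; its name encodes the PR's base and head SHAs, so it
          # identifies the diff for exactly this PR state.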
          all_artifacts=$(curl -sSL "$GH_API_ARTIFACTS")
          diff_artifact=$(echo "$all_artifacts" | jq "first(.artifacts[] | select(.name == \"$DIFF_ARTIFACT_NAME\") )")
          id=$(echo "$diff_artifact" | jq ".id")
          echo "id=$id" | tee -a $GITHUB_OUTPUT
          workflow_run=$(echo "$diff_artifact" | jq ".workflow_run | .id")
          echo "workflow_run=$workflow_run" | tee -a $GITHUB_OUTPUT
          if [ -z "$id" ]; then
            echo "exists=false" | tee -a $GITHUB_OUTPUT
          else
            echo "exists=true" | tee -a $GITHUB_OUTPUT
          fi
      - name: Write instruction to delete PR label
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to delete PR labels. Instead, we forward the
        # instruction to delete the label to fuzzy-ci-privileged.yml.
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        run: |
          mkdir -p ./forward
          jq -n \
            --arg instruction "delete_label" \
            --arg endpoint "$GH_API_LABELS" \
            '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json
      - name: Upload instruction to delete label
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        uses: actions/upload-artifact@v3
        with:
          name: forwarded_instructions
          path: forward/
      - name: Fail due to diff not existing yet
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        run: |
          printf "You seem to have tried to approve a diff that doesn't exist yet.\nWait for the diff to have been generated and then try again."
          exit 1
      - name: Download diff
        env:
          id: ${{ steps.diff_metadata.outputs.id }}
        run: |
          # Doing this manually, since actions/download-artifact only works within the same workflow run that uploaded the artifact
          curl -sSLO -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" "$GH_API_ARTIFACTS/$id/zip" -D headers.txt
      - name: Unzip downloaded diff
        run: |
          unzip zip || (echo "Download of diff artifact failed" && cat headers.txt && cat zip && exit 1)
      - name: Compute full responses diff hash
        id: diff_hash
        run: |
          hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}')
          echo "hash=$hash" | tee -a $GITHUB_OUTPUT
      - name: Write instruction to comment on PR
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to comment on PRs. Instead, we forward the
        # instruction to comment on the PR to fuzzy-ci-privileged.yml.
        env:
          approved_diffs_workflow_run: ${{ steps.diff_metadata.outputs.workflow_run }}
          approved_diffs_hash: ${{ steps.diff_hash.outputs.hash }}
        run: |
          mkdir -p ./forward
          jq -n \
            --arg instruction "comment" \
            --arg endpoint "$GH_API_COMMENTS" \
            --arg artifacts_url "$ACTIONS_RUNS_ENDPOINT/$approved_diffs_workflow_run" \
            --arg hash "$approved_diffs_hash" \
            '{instruction: $instruction, endpoint: $endpoint, artifacts_url: $artifacts_url, hash: $hash}' > ./forward/instruction.json
      - name: Upload instruction to comment on PR
        uses: actions/upload-artifact@v3
        with:
          name: forwarded_instructions
          path: forward/