Skip to content

Commit 01453b4

Browse files
authored
Merge pull request #63 from ChEB-AI/check_tokens_using_action
Add actions for token consistency and reader constants
2 parents 716432c + 6aa6ff1 commit 01453b4

File tree

3 files changed

+266
-0
lines changed

3 files changed

+266
-0
lines changed

.github/workflows/export_constants.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
"""Export selected reader constants to ``constants.json``.

The JSON file is read back by the "Verify Constants" workflow, which turns
every key into an ``E_<KEY>`` environment variable.
"""
import json

from chebai.preprocessing.reader import (
    CLS_TOKEN,
    EMBEDDING_OFFSET,
    MASK_TOKEN_INDEX,
    PADDING_TOKEN_INDEX,
)

# Constants to export, keyed by the name they carry in the reader module.
# Renaming a key here requires the same rename in verify_constants.yml.
constants = dict(
    EMBEDDING_OFFSET=EMBEDDING_OFFSET,
    CLS_TOKEN=CLS_TOKEN,
    PADDING_TOKEN_INDEX=PADDING_TOKEN_INDEX,
    MASK_TOKEN_INDEX=MASK_TOKEN_INDEX,
)


def _write_constants(path):
    """Serialize ``constants`` as JSON into the file at ``path``."""
    serialized = json.dumps(constants)
    with open(path, "w") as out_file:
        out_file.write(serialized)


if __name__ == "__main__":
    # The output lands in the current working directory.
    _write_constants("constants.json")
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
name: Check consistency of tokens.txt file

# Define the file paths under `paths` to trigger this check only when specific files are modified.
# The job then checks only the files that actually changed, rather than all files listed in `paths`.

# **Note**: To add a new token file for checks, include its path in:
#   - `on` -> `push` and `pull_request` sections
#   - `jobs` -> `check_tokens` -> `steps` -> Set global variable for multiple tokens.txt paths -> `TOKENS_FILES`

on:
  push:
    paths:
      - "chebai/preprocessing/bin/smiles_token/tokens.txt"
      - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
      - "chebai/preprocessing/bin/selfies/tokens.txt"
      - "chebai/preprocessing/bin/protein_token/tokens.txt"
      - "chebai/preprocessing/bin/graph_properties/tokens.txt"
      - "chebai/preprocessing/bin/graph/tokens.txt"
      - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
  pull_request:
    paths:
      - "chebai/preprocessing/bin/smiles_token/tokens.txt"
      - "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
      - "chebai/preprocessing/bin/selfies/tokens.txt"
      - "chebai/preprocessing/bin/protein_token/tokens.txt"
      - "chebai/preprocessing/bin/graph_properties/tokens.txt"
      - "chebai/preprocessing/bin/graph/tokens.txt"
      - "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"

jobs:
  check_tokens:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        # Same major version as the "Verify Constants" workflow.
        uses: actions/checkout@v4

      - name: Get list of changed files
        id: changed_files
        run: |
          git fetch origin dev

          # Get the list of changed files compared to origin/dev and save them to a file
          git diff --name-only origin/dev > changed_files.txt

          # Print the names of changed files on separate lines
          echo "Changed files:"
          while IFS= read -r line; do
            echo "Changed File name : $line"
          done < changed_files.txt

      - name: Set global variable for multiple tokens.txt paths
        run: |
          # All token files that need to be checked must be included here too, same as in `paths`.
          TOKENS_FILES=(
            "chebai/preprocessing/bin/smiles_token/tokens.txt"
            "chebai/preprocessing/bin/smiles_token_unlabeled/tokens.txt"
            "chebai/preprocessing/bin/selfies/tokens.txt"
            "chebai/preprocessing/bin/protein_token/tokens.txt"
            "chebai/preprocessing/bin/graph_properties/tokens.txt"
            "chebai/preprocessing/bin/graph/tokens.txt"
            "chebai/preprocessing/bin/deepsmiles_token/tokens.txt"
            "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
          )
          # Exported as one space-separated string so that later steps can read it.
          echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> "$GITHUB_ENV"
69+
- name: Process only changed tokens.txt files
70+
run: |
71+
# Convert the TOKENS_FILES environment variable into an array
72+
TOKENS_FILES=(${TOKENS_FILES})
73+
74+
# Iterate over each token file path
75+
for TOKENS_FILE_PATH in "${TOKENS_FILES[@]}"; do
76+
# Check if the current token file path is in the list of changed files
77+
if grep -q "$TOKENS_FILE_PATH" changed_files.txt; then
78+
echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------"
79+
80+
# Get previous tokens.txt version
81+
git fetch origin dev
82+
git diff origin/dev -- $TOKENS_FILE_PATH > tokens_diff.txt || echo "No previous tokens.txt found for $TOKENS_FILE_PATH"
83+
84+
# Check for deleted or added lines in tokens.txt
85+
if [ -f tokens_diff.txt ]; then
86+
87+
# Check for deleted lines (lines starting with '-')
88+
deleted_lines=$(grep '^-' tokens_diff.txt | grep -v '^---' | sed 's/^-//' || true)
89+
if [ -n "$deleted_lines" ]; then
90+
echo "Error: Lines have been deleted from $TOKENS_FILE_PATH."
91+
echo -e "Deleted Lines: \n$deleted_lines"
92+
exit 1
93+
fi
94+
95+
# Check for added lines (lines starting with '+')
96+
added_lines=$(grep '^+' tokens_diff.txt | grep -v '^+++' | sed 's/^+//' || true)
97+
if [ -n "$added_lines" ]; then
98+
99+
# Count how many lines have been added
100+
num_added_lines=$(echo "$added_lines" | wc -l)
101+
102+
# Get last `n` lines (equal to num_added_lines) of tokens.txt
103+
last_lines=$(tail -n "$num_added_lines" $TOKENS_FILE_PATH)
104+
105+
# Check if the added lines are at the end of the file
106+
if [ "$added_lines" != "$last_lines" ]; then
107+
108+
# Find lines that were added but not appended at the end of the file
109+
non_appended_lines=$(diff <(echo "$added_lines") <(echo "$last_lines") | grep '^<' | sed 's/^< //')
110+
111+
echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file."
112+
echo -e "Added lines that are not at the end of the file: \n$non_appended_lines"
113+
exit 1
114+
fi
115+
fi
116+
117+
if [ "$added_lines" == "" ]; then
118+
echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added."
119+
else
120+
echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end."
121+
fi
122+
else
123+
echo "No previous version of $TOKENS_FILE_PATH found."
124+
fi
125+
else
126+
echo "$TOKENS_FILE_PATH was not changed, skipping."
127+
fi
128+
done
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
name: Verify Constants

# Define the file paths under `paths` to trigger this check only when specific files are modified.
# The job then checks only the files that actually changed, rather than all files listed in `paths`.

# **Note**: To add a new file for checks, include its path in:
#   - `on` -> `push` and `pull_request` sections
#   - `jobs` -> `verify-constants` -> `steps` -> Verify constants -> add a new if/else for your file, with the check logic inside it.

on:
  push:
    paths:
      - "chebai/preprocessing/reader.py"
  pull_request:
    paths:
      - "chebai/preprocessing/reader.py"

jobs:
  verify-constants:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: [
            # Only use 3.10 as of now
            # "3.9",
            "3.10",
            # "3.11"
          ]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set PYTHONPATH
        run: echo "PYTHONPATH=$PWD" >> "$GITHUB_ENV"

      - name: Get list of changed files
        id: changed_files
        run: |
          git fetch origin dev

          # Get the list of changed files compared to origin/dev and save them to a file
          git diff --name-only origin/dev > changed_files.txt

          # Print the names of changed files on separate lines
          echo "Changed files:"
          while IFS= read -r line; do
            echo "Changed File name : $line"
          done < changed_files.txt

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        # Pin torch because of an error with the latest version (2.5.1):
        # ImportError: cannot import name 'T_co' from 'torch.utils.data.dataset'
        run: |
          python -m pip install --upgrade pip setuptools wheel
          python -m pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
          python -m pip install -e .

      - name: Export constants
        run: python .github/workflows/export_constants.py

      - name: Load constants into environment variables
        id: load_constants
        # "E_" is prepended as a prefix to every constant name, to avoid overwriting
        # other environment variables that have the same name.
        run: |
          jq -r 'to_entries|map("E_\(.key)=\(.value|tostring)")|.[]' constants.json >> "$GITHUB_ENV"

      - name: Print all environment variables
        run: printenv
79+
80+
- name: Verify constants
81+
run: |
# Verify that the constants exported from chebai/preprocessing/reader.py
# (available as E_<NAME> environment variables) still have their expected values.

#######################################
# Compare one constant with its expected value.
# Arguments: $1 - constant name, $2 - actual value, $3 - expected value
# Outputs:   an error message on stdout when the values differ
# Returns:   0 on match, 1 on mismatch
#######################################
check_constant() {
  local name=$1 actual=$2 expected=$3
  if [ "$actual" != "$expected" ]; then
    echo "$name ($actual) does not match expected value ($expected)!"
    return 1
  fi
}

#######################################
# Run the constant checks if reader.py is listed in changed_files.txt
# (one path per line, in the current directory).
# Globals:   E_EMBEDDING_OFFSET, E_CLS_TOKEN, E_PADDING_TOKEN_INDEX,
#            E_MASK_TOKEN_INDEX (read)
# Returns:   0 if the file is unchanged or all constants match, 1 otherwise
#######################################
verify_reader_constants() {
  local file_name="chebai/preprocessing/reader.py"

  # -F/-x: match the path literally and as a whole line, so entries such as
  # "reader.py.bak" or "readerXpy" do not trigger the check.
  if ! grep -Fxq -- "$file_name" changed_files.txt; then
    echo "$file_name not found in changed_files.txt; skipping check."
    return 0
  fi
  echo "----------------------- Checking file : $file_name ----------------------- "

  # Define expected values for constants
  local exp_embedding_offset="10"
  local exp_cls_token="2"
  local exp_padding_token_index="0"
  local exp_mask_token_index="1"

  # Debugging output to check environment variables
  echo "Current Environment Variables:"
  echo "E_EMBEDDING_OFFSET = ${E_EMBEDDING_OFFSET:-}"
  echo "Expected: $exp_embedding_offset"

  # Verify constants match expected values; stop at the first mismatch.
  check_constant "EMBEDDING_OFFSET" "${E_EMBEDDING_OFFSET:-}" "$exp_embedding_offset" || return 1
  check_constant "CLS_TOKEN" "${E_CLS_TOKEN:-}" "$exp_cls_token" || return 1
  check_constant "PADDING_TOKEN_INDEX" "${E_PADDING_TOKEN_INDEX:-}" "$exp_padding_token_index" || return 1
  check_constant "MASK_TOKEN_INDEX" "${E_MASK_TOKEN_INDEX:-}" "$exp_mask_token_index" || return 1
}

verify_reader_constants || exit 1

0 commit comments

Comments
 (0)