Skip to content

Commit

Permalink
[CICO-6] added pre-commit, added ruff Github action, added ruff pre-c…
Browse files Browse the repository at this point in the history
…ommit, started working on fixing add_item method, updated requirements.txt, started working on PyPi structure
  • Loading branch information
jophals committed Nov 21, 2024
1 parent a1b73bd commit 0039256
Show file tree
Hide file tree
Showing 12 changed files with 161 additions and 87 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/github-actions-demo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: Ruff
on: [ push, pull_request ]
jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: astral-sh/ruff-action@v1
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
hooks:
- id: check-yaml
- id: check-ast
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.4
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
7 changes: 0 additions & 7 deletions authentication.py

This file was deleted.

10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cfgv==3.4.0
distlib==0.3.9
filelock==3.16.1
identify==2.6.2
nodeenv==1.9.1
platformdirs==4.3.6
pre_commit==4.0.1
PyYAML==6.0.2
ruff==0.7.4
virtualenv==20.27.1
64 changes: 42 additions & 22 deletions CitesphereConnector.py → src/CitesphereConnector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,45 @@ def __init__(self, api, auth_token_object):
self.handle_api_params()

def validate(self):
    """Check that ``self.auth_token_object`` can be used to authenticate.

    The object must expose ``authType`` and ``headers`` attributes and carry
    either an ``access_token`` or a complete ``username``/``password`` pair.

    Raises:
        AttributeError: if ``authType`` or ``headers`` is missing, or if
            neither an access token nor both a username and a password
            are available.
        ValueError: if ``authType`` is neither ``"oauth"`` nor ``"basic"``.
    """
    auth = self.auth_token_object

    for required in ("authType", "headers"):
        if not hasattr(auth, required):
            raise AttributeError(f"Missing {required} attribute")

    # A credential that is set to None counts as absent: auth objects that
    # define every field with a None default would otherwise always pass.
    has_token = getattr(auth, "access_token", None) is not None
    # Both halves of the pair are required; a lone username (or password)
    # is not enough to build a Basic header.
    has_login = (
        getattr(auth, "username", None) is not None
        and getattr(auth, "password", None) is not None
    )
    if not (has_token or has_login):
        raise AttributeError(
            "Either username and password or access_token should be present"
        )

    if auth.authType not in ("oauth", "basic"):
        raise ValueError("authType should be either oauth or basic")

def handle_api_params(self):
    """Build the ``Authorization`` header for the configured auth type.

    Replaces ``self.auth_token_object.headers`` with a single-entry dict:
    a ``Bearer`` header built from ``access_token`` when ``authType`` is
    ``"oauth"``, or a ``Basic`` header built from ``username`` and
    ``password`` when it is ``"basic"``. Any other ``authType`` leaves the
    headers untouched.
    """
    auth = self.auth_token_object
    if auth.authType == "oauth":
        auth.headers = {"Authorization": "Bearer {}".format(auth.access_token)}
    elif auth.authType == "basic":
        credentials = "{}:{}".format(auth.username, auth.password)
        # b64encode returns bytes; decode it so the header holds the bare
        # base64 text rather than the "b'...'" repr of a bytes object.
        # UTF-8 keeps non-ASCII credentials from raising UnicodeEncodeError
        # and is byte-identical to ASCII for ASCII-only input.
        encoded = base64.b64encode(credentials.encode("utf-8")).decode("ascii")
        auth.headers = {"Authorization": "Basic {}".format(encoded)}

def execute_command(self, url):
try:
response = urllib2.urlopen(urllib2.Request(url, headers=self.auth_token_object.headers))
response = urllib2.urlopen(
urllib2.Request(url, headers=self.auth_token_object.headers)
)
data = json.load(response)

return data
Expand All @@ -48,7 +63,7 @@ def get_user(self):
def check_test(self):
    """Request the ``/v1/test`` endpoint and return ``execute_command``'s result."""
    return self.execute_command(f"{self.api}/v1/test")

def check_access(self, document_id):
    """Request the access-check endpoint for a Giles document.

    Args:
        document_id: Identifier inserted into the
            ``/files/giles/<id>/access/check`` path.

    Returns:
        Whatever ``execute_command`` returns for that URL.
    """
    endpoint = f"{self.api}/files/giles/{document_id}/access/check"
    return self.execute_command(endpoint)
Expand All @@ -75,20 +90,25 @@ def get_collections(self, zotero_group_id):
return self.execute_command(url)

def get_collection_items(self, zotero_group_id, collection_id, page_number=0):
    """Request the items of one collection in a group.

    Args:
        zotero_group_id: Group identifier used in the URL path.
        collection_id: Collection identifier used in the URL path.
        page_number: Page to request. A falsy value (the default ``0``)
            sends the request without a ``page`` query parameter.

    Returns:
        Whatever ``execute_command`` returns for the built URL.
    """
    base = f"{self.api}/v1/groups/{zotero_group_id}/collections/{collection_id}/items"
    # Only append the query string when a page was actually requested.
    query = f"?&page={page_number}" if page_number else ""
    return self.execute_command(base + query)

def get_item_info(self, zotero_group_id, item_id):
    """Request a single item of a group.

    Args:
        zotero_group_id: Group identifier used in the URL path.
        item_id: Item identifier used in the URL path.

    Returns:
        Whatever ``execute_command`` returns for that URL.
    """
    return self.execute_command(
        f"{self.api}/v1/groups/{zotero_group_id}/items/{item_id}"
    )

def get_collections_by_collection_id(self, zotero_group_id, collection_id):
    """Request the collections nested under a given collection.

    Args:
        zotero_group_id: Group identifier used in the URL path.
        collection_id: Parent collection identifier used in the URL path.

    Returns:
        Whatever ``execute_command`` returns for that URL.
    """
    endpoint = (
        f"{self.api}/groups/{zotero_group_id}/collections/{collection_id}/collections"
    )
    return self.execute_command(endpoint)

def add_item(self, group_id, file_path=None):
    """Call the item-creation endpoint of a group.

    Args:
        group_id: Group identifier used in the URL path.
        file_path: Path of the file to attach to the new item. Not used
            yet; it is optional so that existing ``add_item(group_id)``
            callers keep working.

    Returns:
        Whatever ``execute_command`` returns for the create URL.
    """
    # TODO: send file_path as a multipart upload. execute_command only
    # receives a URL here, so no file content is transmitted yet.
    url = f"{self.api}/v1/groups/{group_id}/items/create"
    return self.execute_command(url)

Binary file not shown.
Binary file added src/__pycache__/authentication.cpython-310.pyc
Binary file not shown.
Binary file added src/__pycache__/constants.cpython-310.pyc
Binary file not shown.
14 changes: 14 additions & 0 deletions src/authentication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class AuthObject:
    """Plain container for the settings used to authenticate API requests.

    Every field defaults to ``None`` and may be given to the constructor or
    assigned on the instance afterwards.

    Attributes:
        authType: Name of the authentication scheme.
        headers: Request headers to send.
        username: Login name.
        password: Login password.
        access_token: Access token.
    """

    def __init__(
        self,
        authType=None,
        headers=None,
        username=None,
        password=None,
        access_token=None,
    ):
        # Scheme selection and the headers that go with it.
        self.authType, self.headers = authType, headers
        # Credential fields.
        self.username, self.password = username, password
        self.access_token = access_token
2 changes: 1 addition & 1 deletion constants.py → src/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
CITESPHERE_API_URL = "https://diging-dev.asu.edu/citesphere-review/api"
MAX_SIZE = 50
GILES_URL = f"https://diging.asu.edu/geco-giles-staging/api/v2/resources/files/"
GILES_URL = "https://diging.asu.edu/geco-giles-staging/api/v2/resources/files/"
109 changes: 62 additions & 47 deletions csvGenerator.ipynb → src/csvGenerator.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
"import os\n",
"import csv\n",
"import math\n",
"import random\n",
"import requests\n",
"import constants as const \n",
"import constants as const\n",
"from authentication import AuthObject\n",
"from CitesphereConnector import CitesphereConnector"
]
Expand All @@ -25,15 +24,16 @@
"outputs": [],
"source": [
"auth_object = AuthObject()\n",
"auth_object.authType = 'oauth'\n",
"auth_object.authType = \"oauth\"\n",
"auth_object.access_token = \"f5f7e899-30d3-4531-8b2e-8009e9969ed4\"\n",
"citesphere_api_url = const.CITESPHERE_API_URL\n",
"connector = CitesphereConnector(citesphere_api_url, auth_object)\n",
"#default max number of items displayed on a collection items page in citesphere\n",
"max_size=const.MAX_SIZE\n",
"# default max number of items displayed on a collection items page in citesphere\n",
"max_size = const.MAX_SIZE\n",
"\n",
"def get_file(file_id:str)-> str:\n",
" return const.GILES_URL+\"{}/content\".format(file_id)"
"\n",
"def get_file(file_id: str) -> str:\n",
" return const.GILES_URL + \"{}/content\".format(file_id)"
]
},
{
Expand Down Expand Up @@ -67,7 +67,7 @@
"metadata": {},
"outputs": [],
"source": [
"groups=connector.get_groups()"
"groups = connector.get_groups()"
]
},
{
Expand Down Expand Up @@ -107,35 +107,33 @@
"metadata": {},
"outputs": [],
"source": [
"#download files from the collection items\n",
"def download_files(folder_path:str,ids:set, citesphere_token:str) -> list:\n",
" \n",
" #stores paths to downloaded files\n",
"# download files from the collection items\n",
"def download_files(folder_path: str, ids: set, citesphere_token: str) -> list:\n",
" # stores paths to downloaded files\n",
" path_list = []\n",
" \n",
" #iterating through the ids list\n",
" for (file_id, file_name) in ids:\n",
" \n",
"\n",
" # iterating through the ids list\n",
" for file_id, file_name in ids:\n",
" # getting the file url using giles file id\n",
" giles_url = get_file(file_id)\n",
" os.makedirs(folder_path, exist_ok=True)\n",
" filename = os.path.join(folder_path, f\"{file_name}\")\n",
" \n",
" #append the path of the saved file to the folder\n",
"\n",
" # append the path of the saved file to the folder\n",
" path_list.append(filename)\n",
" \n",
" #header for get request\n",
"\n",
" # header for get request\n",
" headers = {\n",
" \"Authorization\": f\"Bearer {citesphere_token}\",\n",
" \"Content-Type\": \"application/pdf;charset=UTF-8\"\n",
" }\n",
" \"Content-Type\": \"application/pdf;charset=UTF-8\",\n",
" }\n",
" response = requests.get(giles_url, headers=headers)\n",
" \n",
" #saving the file if retrieved successfully\n",
"\n",
" # saving the file if retrieved successfully\n",
" if response.status_code == 200:\n",
" with open(filename, \"wb\") as file:\n",
" file.write(response.content)\n",
" return path_list "
" return path_list"
]
},
{
Expand Down Expand Up @@ -226,8 +224,7 @@
"source": [
"# Create the CSV file with all the metadata and file path of the downloaded files\n",
"def write_to_csv(csv_name: str, item: list, flag: int) -> None:\n",
"\n",
" with open(csv_name, 'a', newline='') as file:\n",
" with open(csv_name, \"a\", newline=\"\") as file:\n",
" writer = csv.writer(file)\n",
"\n",
" # Check if it's the first time writing to the file\n",
Expand All @@ -236,7 +233,7 @@
" writer.writerow(fields)\n",
"\n",
" # Write the values to the CSV file\n",
" writer.writerow(list(item.values()))\n"
" writer.writerow(list(item.values()))"
]
},
{
Expand Down Expand Up @@ -275,8 +272,9 @@
"metadata": {},
"outputs": [],
"source": [
"def add_to_csv(csv_name: str, folder_name: str, items: list, csv_dict: dict, flag: int) -> int:\n",
"\n",
"def add_to_csv(\n",
" csv_name: str, folder_name: str, items: list, csv_dict: dict, flag: int\n",
") -> int:\n",
" for item in items[\"items\"]:\n",
" if item[\"key\"] in csv_dict:\n",
" continue\n",
Expand All @@ -291,16 +289,28 @@
" for values in items_list:\n",
" # Getting the file IDs in uploadedFile, extractedText, pages, image, text, ocr, additionalFiles\n",
" if values[\"uploadedFile\"] and values[\"uploadedFile\"] != \"None\":\n",
" giles_ids.add((values[\"uploadedFile\"][\"id\"], values[\"uploadedFile\"][\"filename\"]))\n",
" giles_ids.add(\n",
" (\n",
" values[\"uploadedFile\"][\"id\"],\n",
" values[\"uploadedFile\"][\"filename\"],\n",
" )\n",
" )\n",
"\n",
" # Check if extractedText is present and not equal to \"None\"\n",
" if values[\"extractedText\"] and values[\"extractedText\"] != \"None\":\n",
" giles_ids.add((values[\"extractedText\"][\"id\"], values[\"extractedText\"][\"filename\"]))\n",
" giles_ids.add(\n",
" (\n",
" values[\"extractedText\"][\"id\"],\n",
" values[\"extractedText\"][\"filename\"],\n",
" )\n",
" )\n",
"\n",
" # Check if pages is present and not equal to \"None\"\n",
" if values[\"pages\"] and values[\"pages\"] != \"None\":\n",
" for value in values[\"pages\"]:\n",
" giles_ids.add((value[\"image\"][\"id\"], value[\"image\"][\"filename\"]))\n",
" giles_ids.add(\n",
" (value[\"image\"][\"id\"], value[\"image\"][\"filename\"])\n",
" )\n",
" giles_ids.add((value[\"text\"][\"id\"], value[\"text\"][\"filename\"]))\n",
" giles_ids.add((value[\"ocr\"][\"id\"], value[\"ocr\"][\"filename\"]))\n",
"\n",
Expand All @@ -310,14 +320,16 @@
"\n",
" if giles_ids:\n",
" # store paths of the downloaded files to the path attribute\n",
" item[\"paths\"] = download_files(folder_name, giles_ids, auth_object.access_token)\n",
" item[\"paths\"] = download_files(\n",
" folder_name, giles_ids, auth_object.access_token\n",
" )\n",
"\n",
" # Add the item to csv_dict and write it to the CSV file\n",
" csv_dict[item[\"key\"]] = item\n",
" write_to_csv(csv_name, item, flag)\n",
" flag = 1\n",
"\n",
" return flag\n"
" return flag"
]
},
{
Expand Down Expand Up @@ -358,26 +370,29 @@
"outputs": [],
"source": [
"# Downloads and generates a CSV file containing all the group items information\n",
"def process_groups(csv_name: str, folder_path: str, groups: list, connector, max_size: int) -> dict:\n",
"\n",
"def process_groups(\n",
" csv_name: str, folder_path: str, groups: list, connector, max_size: int\n",
") -> dict:\n",
" csv_dict = {}\n",
" flag = 0\n",
" \n",
" #Iterate over the groups\n",
"\n",
" # Iterate over the groups\n",
" for group in groups:\n",
" group_id = group[\"id\"]\n",
" collections = connector.get_collections(group_id)\n",
" \n",
" #Iterate over the collections in the respective group\n",
"\n",
" # Iterate over the collections in the respective group\n",
" for collection in collections[\"collections\"]:\n",
" num_pages = math.ceil(collection[\"numberOfItems\"] / max_size)\n",
" \n",
" #Iterating over the pages\n",
"\n",
" # Iterating over the pages\n",
" for page in range(1, num_pages + 1):\n",
" items = connector.get_collection_items(group_id, collection[\"key\"], page)\n",
" items = connector.get_collection_items(\n",
" group_id, collection[\"key\"], page\n",
" )\n",
" flag = add_to_csv(csv_name, folder_path, items, csv_dict, flag)\n",
"\n",
" return csv_dict\n"
" return csv_dict"
]
},
{
Expand All @@ -391,9 +406,9 @@
"source": [
"csv_filename = \"citesphere_csv.csv\"\n",
"\n",
"folder_path = \"Files\"\n",
"folder_path = \"Files\"\n",
"\n",
"process_groups(csv_filename,folder_path,groups,connector,max_size)\n"
"process_groups(csv_filename, folder_path, groups, connector, max_size)"
]
},
{
Expand Down
Loading

0 comments on commit 0039256

Please sign in to comment.