diff --git a/src/problem_bank_scripts/webwork_to_md.py b/src/problem_bank_scripts/webwork_to_md.py index 436b67d8..8bcebd3d 100644 --- a/src/problem_bank_scripts/webwork_to_md.py +++ b/src/problem_bank_scripts/webwork_to_md.py @@ -1,3 +1,13 @@ +""" +@Author: Parsa Rajabi (@parsa-rajabi) +@Created: 2021 +@Description: Converts webwork files from .PG to markdown .MD +Usage: + webwork_to_md.py +Arguments: + source_path Path to root of all the pg source files. + destination_path Path to destination of all md output files. +""" import os from pathlib import Path from pprint import pprint @@ -5,20 +15,31 @@ import re import time from shutil import copy2 +import sys +import logging +from docopt import docopt + +logging.basicConfig(filename='Webwork_to_md_logs.log', level=logging.INFO) +logging.info('Started Session') + +# read passed in arguments +#args = docopt(__doc__) + +# set source_path with passed in path +#source_path = args[''] +#destination_path = args[''] -# loop through every file in the dir -root_path = '../../webwork-open-problem-library/Contrib/BrockPhysics/College_Physics_Urone/' -root_dest_folder = '../../instructor_physics_bank/source_ww/output_md/College_Physics_Urone/' +#TODO: comment out the above lines to reactivate docopt, using this as a temporary workaround +source_path = '../../../webwork-open-problem-library/Contrib/BrockPhysics/College_Physics_Urone/' +destination_path = '../../../instructor_physics_bank/webwork/' # variable declaration counter = 0 source_files = [] -src_dirs = [] title = topic = author = editor = date = source = template_version = problem_type = attribution = outcomes = difficulty = randomization = taxonomy = "" tags = assets = altText = image_line = [] total_start_time = time.process_time() - # Variable declaration for Webwork keywords metadata_end_src = "DOCUMENT();" marcos_end_src = "TEXT(beginproblem());" @@ -34,15 +55,26 @@ context_src = "Context" partial_answer_src = "showPartialCorrectAnswers" -# extract file structure 
def sanitize_file_path(file_path):
    """
    description: ensures the file path ends with a trailing forward slash ('/')
    @param file_path: path string to normalise; falsy values ('' or None) are returned unchanged
    @return: file_path with exactly one '/' appended when it did not already end with one
    """
    # Only non-empty paths are touched, so an empty path is never turned into
    # "/" (the filesystem root).
    if file_path and not file_path.endswith('/'):
        file_path += "/"
    return file_path
containing various elements such as import, generate and prepare. + """ # server variables server_imports = """ import random @@ -158,7 +206,21 @@ def server(question_solution): 'parse': server_parse, 'grade': server_grade} + def yaml_dump(directory_info, metadata, question_format, image_dic, question_text, question_units, question_parts, question_solution, destination_file_path): + """ + description: all problem sections are bundled up and dumped into a markdown file (created) + @param directory_info: + @param metadata: + @param question_format: + @param image_dic: + @param question_text: + @param question_units: + @param question_parts: + @param question_solution: + @param destination_file_path: + @return: none + """ # This solution is copied from this SO answer: https://stackoverflow.com/a/45004775/2217577 yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str @@ -208,13 +270,13 @@ def repr_str(dumper, data): Path(destination_file_path + directory_info['filename'] + ".md").write_text('---\n' + yaml.safe_dump(yaml_dict, sort_keys=False) + '---\n\n' - + '# {{ params.vars.title }} \n\n' + + '# {{ params.vars.title }} \n' + # Question image + ''.join(f'{image}\n' for image in question_images) - + ''.join(f'\n{question}\n' for part, question in zip(question_parts, question_text) if (part == 0)) - + ''.join(f'\n## Part {part} \n{question} \n\n\n ### Answer Section\n' for part, question in zip(question_parts, question_text) if (part > 0)) - + str(question_units) + '\n\n' - + '## pl-submission-panel \n\n\n' - + '## pl-answer-panel \n\n\n' + # Question body w/ final answer units + + ''.join(f'\n{question}\n \n### Answer Section\n{final_answer_unit}\n ' for part, question, final_answer_unit in zip(question_parts, question_text, question_units) if (part == 0)) + # Question part number and question body w/ final answer units (if question is multi-part) + + ''.join(f'\n## Part {part} \n{question}\n \n### Answer Section\n{final_answer_unit}\n' for part, 
def get_part_type(part_type):
    """
    description: builds the descriptor dictionary for a single question part
    @param part_type: value stored under the "type" key (e.g. numerical, text, etc.)
    @return: dictionary holding the part type plus a fixed "pl-customizations" dictionary
    """
    # The customizations are identical for every part: weight "1" and a hidden answer panel.
    customizations = {"weight": "1", "hide-answer-panel": "true"}
    return {"type": part_type, "pl-customizations": customizations}
def append_part_counter(part_counter, part_headers):
    """
    description: records a part number in the list of part headers if it is not already present
    @param part_counter: part number to record
    @param part_headers: list of part numbers collected so far (modified in place)
    @return: the same part_headers list, containing part_counter at most once
    """
    if part_counter not in part_headers:
        part_headers.append(part_counter)
    return part_headers


def extract_problem_type(problem_subsection, filename):
    """
    description: extracts the ANS(...) statements from a problem subsection and classifies them
    @param problem_subsection: problem content; converted with str() before searching
    @param filename: passed through unchanged to determine_problem_type
    @return: whatever determine_problem_type returns for the extracted ANS(...); statements
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    question_format_raw = re.findall(r"(ANS\(.+?\);)", str(problem_subsection))
    return determine_problem_type(question_format_raw, filename)


def help_problem_extract_ans_units(problem_subsection):
    """
    description: extracts the final answer units and the question text from one question section
    @param problem_subsection: a single section of the question body (string)
    @return: dictionary with 'section' (the section text, or '' when the section is an image,
             starts with an ans_rule or ends with "\\)") and 'final_ans_units' (the unit string,
             or '' unless exactly one unit is found)
    """
    question_final_units = ''
    section_clean = ''
    # Image sections carry neither question text nor units, so they are skipped entirely.
    if not problem_subsection.startswith("\\{ image") and not problem_subsection.endswith(") \\}"):
        # Units are the \textrm{...} group that directly follows a closing "\}" and an opening "\(".
        # Raw string keeps the pattern identical while avoiding the invalid "\(" escape.
        final_ans_units = re.findall(r'\\} \\\(\\textrm{(.+?)}', problem_subsection)
        # Only an unambiguous single match is accepted as the unit of the answer.
        if len(final_ans_units) == 1:
            question_final_units = final_ans_units[0]
        # Keep the text only when the section neither starts with an ans_rule nor ends with "\)".
        if not problem_subsection.startswith("\\{ans_rule") and not problem_subsection.endswith("\\)"):
            section_clean = problem_subsection
    return {'section': section_clean,
            'final_ans_units': question_final_units}
""" question_solution = [] for solution in problem_solution: @@ -361,29 +475,70 @@ def extract_problem_solution(problem_solution): return question_solution -# for loop runs based # of folders in src -for root, dirs, files in os.walk(root_path): - # create dest file structure based on source directory - for dir_path in src_dirs: - Path(dir_path).mkdir(parents=True, exist_ok=True) + +def progress(count, total, status=''): + """ + description: prints a dynamic progress bar source: https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 + @param count: + @param total: + @param status: + """ + bar_len = 60 + filled_len = int(round(bar_len * count / float(total))) + + percents = round(100.0 * count / float(total), 1) + bar = '=' * filled_len + '-' * (bar_len - filled_len) + + sys.stdout.write('[%s] %s%s -- %s\r' % (bar, percents, '%', status)) + sys.stdout.flush() + +# -------------------------------------------------------------------------------------------------------------------- # + + +# sanitize source path to ensure it has a trailing backslash +source_path = sanitize_file_path(source_path) +# set root destination folder + +#TODO: Fix this so it's more robust by using pathlib! 
+root_dest_folder = sanitize_file_path(destination_path) + 'source/' + source_path.split('/')[-2] + '/' + +# Create root_dest_folder if it doesn't exist +Path(root_dest_folder).mkdir(parents=True, exist_ok=True) + +# iterate through all the files and dirs in the source directory +for root, dirs, files in os.walk(source_path): + # iterate through all the files in the current directory + for name in dirs: + dest_folder = os.path.join(root, name).removeprefix(source_path) + # create dest file structure based on source directory + Path(root_dest_folder + dest_folder).mkdir(parents=True, exist_ok=True) # iterate through each file for file in files: + # if file is a .pg file (PEAL) if file.endswith('.pg'): + # add file path to source_files list source_files.append(os.path.join(root, file)) - +# iterate through every .pg file found in the source directory for source_filepath in source_files: try: - dest_file_path = source_filepath[78:source_filepath.rfind('/')] + # start timer for processing file + file_start_time = time.process_time() + # extract and build information directory + dest_file_path = source_filepath[:source_filepath.rfind('/')].removeprefix(source_path) filename = source_filepath[source_filepath.rfind('/')+1:-3] folder_dir = source_filepath[:source_filepath.rfind('/')] - file_start_time = time.process_time() file_dir = source_filepath[source_filepath.find("Contrib"):] - question_file = open(source_filepath, 'r') - file_contents = question_file.read() - - file_contents_dic = split_file(file_contents) - metadata_dic = metadata_extract(file_contents_dic['metadata']) + """ + Example of dir_info output + { + 'filename': 'NU_U17-33-02-002', + 'file_dir': 'Contrib/BrockPhysics/College_Physics_Urone/33.Particle_Physics/33-02.Four_Basic_Forces/NU_U17-33-02-002.pg', + 'folder_dir': '../../../webwork-open-problem-library/Contrib/BrockPhysics/College_Physics_Urone/33.Particle_Physics/33-02.Four_Basic_Forces', + 'root_dest_folder': 'source/College_Physics_Urone/', + 
'dest_file_path': '33.Particle_Physics/33-02.Four_Basic_Forces' + } + """ dir_info = { 'filename': filename, 'file_dir': file_dir, @@ -391,31 +546,51 @@ def extract_problem_solution(problem_solution): 'root_dest_folder': root_dest_folder, 'dest_file_path': dest_file_path } + # each question has a its own unique folder named after the file itself i.e question file NU_123.md is within NU_123 folder destination_file_path = root_dest_folder + dest_file_path + "/" + filename + "/" + Path(destination_file_path).mkdir(parents=True, exist_ok=True) + # open and read question file + question_file = open(source_filepath, 'r') + file_contents = question_file.read() + # split content of the question file into sections + file_contents_dic = split_file(file_contents) + # extract metadata from the question file + metadata_dic = metadata_extract(file_contents_dic['metadata']) + # extract question body from the question file question_body = file_contents_dic['question_body'] + # extract question images from the question body image_dic = image_extract(question_body) + # extract question item such as text, part #s, units from the question body question_extract = problem_extract(question_body, image_dic['image_alt_text']) question_text = question_extract['question_text'] question_parts = question_extract['question_parts'] question_units = question_extract['question_units'] + # determine question type question_formats = extract_problem_type(file_contents, dir_info['filename'])['question_type'] + # extract question solution from the question content question_solution = extract_problem_solution(file_contents_dic['question_solution']) - Path(destination_file_path).mkdir(parents=True, exist_ok=True) + # send all dictionaries to yaml_dump to create yaml files yaml_dump(dir_info, metadata_dic, question_formats, image_dic, question_text, question_units, question_parts, question_solution, destination_file_path) + # end timer for processing file end_file_time = time.process_time() + # calculate 
total time for processing file file_process_time = end_file_time - file_start_time - counterString = '#' + str(counter + 1) + ' - [' + str(round(file_process_time, 5)) + '] ' - currentFile = root_dest_folder + dest_file_path + "/" + filename + # print/update progress bar counter += 1 - print(counterString + currentFile) - + progress(counter, len(source_files), status="Files Processed: " + str(counter) + "/" + str(len(source_files))) except Exception as e: print(e) + logging.error('Error: ' + str(e)) pass # ------------------------ STATS ------------------------ # total_end_time = time.process_time() process_time_seconds = total_end_time - total_start_time +print('\n---') print('total time:', round(process_time_seconds / 60, 2), 'minutes,', round(process_time_seconds, 2), 'seconds') -print('avg time per each file:', round(process_time_seconds / counter, 2), 'seconds [', counter, '] files') +try: + print('avg time per file:', round(process_time_seconds / counter, 2), 'seconds [', counter, '] files') +except ZeroDivisionError: + print("Something went wrong, the counter is 0!") +logging.info('Session Completed')