-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from Opt-On/transcript-parsing
Transcript parsing
- Loading branch information
Showing
10 changed files
with
286 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
198 changes: 198 additions & 0 deletions
198
spring-boot/src/main/java/com/opton/spring_boot/transcript_parser/TranscriptParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
package com.opton.spring_boot.transcript_parser; | ||
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.web.multipart.MultipartFile;

import com.opton.spring_boot.transcript_parser.types.Summary;
import com.opton.spring_boot.transcript_parser.types.TermSummary;
import com.opton.util.TermSeasonYearToId;
|
||
|
||
public class TranscriptParser { | ||
private static Pattern courseRegex = Pattern.compile("([A-Z]{2,})\\x20{2,}(\\d{1,3}\\w*)\\x20{1,}.*\\n"); | ||
private static Pattern creditRegex = Pattern.compile("\\d\\.\\d{2}"); | ||
private static Pattern levelRegex = Pattern.compile("Level:\\s+(\\w{2,})"); | ||
private static Pattern studentIdRegex = Pattern.compile("Student ID:\\s+(\\d+)"); | ||
private static Pattern termRegex = Pattern.compile("(?m)^\\s*(Fall|Winter|Spring)\\s+(\\d{4})\\s*$"); | ||
|
||
public static boolean IsTransferCredit(String courseLine){ | ||
Matcher regex = courseRegex.matcher(courseLine); | ||
return regex.matches(); | ||
} | ||
|
||
public static List<int[]> findAllStringSubmatchIndex(Matcher matcher) { | ||
List<int[]> results = new ArrayList<>(); | ||
|
||
while (matcher.find()) { | ||
// Create an array to store the start and end indices of the full match and capturing groups | ||
int[] matchIndices = new int[matcher.groupCount() * 2 + 2]; | ||
matchIndices[0] = matcher.start(); // Start index of the full match | ||
matchIndices[1] = matcher.end(); // End index of the full match | ||
|
||
// Add start and end indices of capturing groups | ||
for (int i = 1; i <= matcher.groupCount(); i++) { | ||
matchIndices[i * 2] = matcher.start(i); // Start index of the capturing group | ||
matchIndices[i * 2 + 1] = matcher.end(i); // End index of the capturing group | ||
} | ||
|
||
results.add(matchIndices); | ||
} | ||
|
||
return results; | ||
} | ||
|
||
// todo: not void | ||
public static Summary ParseTranscript(MultipartFile file) throws Exception{ | ||
String transcriptData = PDFToText(file); | ||
ArrayList <TermSummary> termSummaries = extractTermSummaries(transcriptData); | ||
int studentNumber = extractStudentNumber(transcriptData); | ||
String programName = extractProgramName(transcriptData); | ||
|
||
Summary summary = new Summary(); | ||
summary.studentNumber = studentNumber; | ||
summary.programName = programName; | ||
summary.termSummaries = termSummaries; | ||
|
||
System.out.println(programName); | ||
System.out.println(studentNumber); | ||
|
||
return summary; | ||
} | ||
|
||
@SuppressWarnings("null") | ||
public static String PDFToText(MultipartFile file) throws Exception{ | ||
if (file.isEmpty() || !file.getOriginalFilename().endsWith(".pdf")) { | ||
System.err.println("invalid pdf transcript uploaded"); | ||
throw new Exception("bad file"); | ||
} | ||
|
||
try (PDDocument document = PDDocument.load(file.getInputStream())) { | ||
PDFTextStripper pdfStripper = new PDFTextStripper(); | ||
String text = pdfStripper.getText(document); | ||
// System.err.println(text); | ||
return text; | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
System.err.println("failed to parse pdf"); | ||
throw new Exception("can't parse"); | ||
} | ||
} | ||
|
||
// courseLine is of one of the following forms: | ||
// | ||
// ECON 102 Macroeconomics 0.50 0.50 98 | ||
// ECON 102 Macroeconomics | ||
// ECON 102 Macroeconomics 0.50 | ||
// | ||
// Those are, in order: past term course, current term course, transfer credit. | ||
// isTransferCredit should return true only for the last case. | ||
public static boolean isTransferCredit(String courseLine) { | ||
ArrayList <String> matches =new ArrayList<>(); | ||
Matcher gradeMatcher = creditRegex.matcher(courseLine); | ||
while (gradeMatcher.find()){ | ||
matches.add(gradeMatcher.group()); | ||
} | ||
return matches.size() == 1; | ||
} | ||
|
||
public static ArrayList<TermSummary> extractTermSummaries(String text) throws Exception{ | ||
Matcher termMatcher = termRegex.matcher(text); | ||
Matcher levelMatcher = levelRegex.matcher(text); | ||
Matcher courseMatcher = courseRegex.matcher(text); | ||
|
||
// iterate through the matches, if len not same throw exception | ||
List <int[]> termMatches = findAllStringSubmatchIndex(termMatcher); | ||
List <int[]> levelMatches = findAllStringSubmatchIndex(levelMatcher); | ||
List <int[]> courseMatches = findAllStringSubmatchIndex(courseMatcher); | ||
|
||
if (termMatches.size() != levelMatches.size()){ | ||
throw new Exception("num terms != num levels"); | ||
} | ||
|
||
// ArrayList<String> courseList = new ArrayList<>(); | ||
ArrayList <TermSummary> termSummaries = new ArrayList<>(); | ||
|
||
int j = 0; | ||
for (int i = 0; i < termMatches.size(); i++){ | ||
String season = text.substring(termMatches.get(i)[2], termMatches.get(i)[3]); | ||
String year = text.substring(termMatches.get(i)[4], termMatches.get(i)[5]); | ||
|
||
int termCode = TermSeasonYearToId.termSeasonYearToId(season, year); | ||
String level = text.substring(levelMatches.get(i)[2], levelMatches.get(i)[3]); | ||
|
||
TermSummary termSummary = new TermSummary(); | ||
termSummary.level = level; | ||
termSummary.termId = termCode; | ||
termSummary.courses = new ArrayList<>(); | ||
|
||
for ( | ||
; j < courseMatches.size() | ||
&& (i == termMatches.size() - 1 || courseMatches.get(j)[0] < termMatches.get(i+1)[0]) | ||
; j++ | ||
){ | ||
if (isTransferCredit(text.substring(courseMatches.get(j)[0], courseMatches.get(j)[1]))){ | ||
continue; | ||
} | ||
// course info | ||
String department = text.substring(courseMatches.get(j)[2], courseMatches.get(j)[3]); | ||
String number = text.substring(courseMatches.get(j)[4], courseMatches.get(j)[5]); | ||
String course = (department + number).toLowerCase(); | ||
// TODO: use some data type for courses (vs combined string) | ||
termSummary.courses.add((course).toLowerCase()); | ||
} | ||
// termSummary.courses = ... | ||
termSummaries.add(termSummary); | ||
} | ||
|
||
return termSummaries; | ||
|
||
} | ||
|
||
public static int extractStudentNumber(String text) throws IllegalArgumentException{ | ||
Matcher studentNumberMatcher = studentIdRegex.matcher(text); | ||
|
||
if (!studentNumberMatcher.find()){ | ||
throw new IllegalArgumentException("no student id"); | ||
} | ||
|
||
String studentNumberStr = studentNumberMatcher.group(1); | ||
|
||
// Convert the student number to an integer | ||
try { | ||
return Integer.parseInt(studentNumberStr); | ||
} catch (NumberFormatException e) { | ||
throw new IllegalArgumentException("Invalid student number: " + studentNumberStr); | ||
} | ||
} | ||
|
||
public static String extractProgramName(String text) throws IllegalArgumentException { | ||
// Find the start index of "Program:" | ||
int start = text.lastIndexOf("Program:"); | ||
if (start == -1) { | ||
throw new IllegalArgumentException("Program name not found"); | ||
} | ||
|
||
// Skip "Program:" to get the start of the program name | ||
start += 8; | ||
|
||
// Find the end of the program name (delimited by ',' or '\n') | ||
for (int end = start; end < text.length(); end++) { | ||
char ch = text.charAt(end); | ||
if (ch == ',' || ch == '\n') { | ||
// Extract and trim the program name | ||
return text.substring(start, end).trim(); | ||
} | ||
} | ||
|
||
// If no delimiter is found, throw an error | ||
throw new IllegalArgumentException("Unexpected end of transcript"); | ||
} | ||
|
||
} | ||
|
Binary file added
BIN
+7.24 KB
...oot/src/main/java/com/opton/spring_boot/transcript_parser/test_data/transcript-simple.pdf
Binary file not shown.
Binary file added
BIN
+9.44 KB
...t/src/main/java/com/opton/spring_boot/transcript_parser/test_data/transcript-transfer.pdf
Binary file not shown.
14 changes: 14 additions & 0 deletions
14
spring-boot/src/main/java/com/opton/spring_boot/transcript_parser/types/Summary.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package com.opton.spring_boot.transcript_parser.types; | ||
|
||
import java.util.ArrayList; | ||
|
||
public class Summary { | ||
// Student number | ||
public int studentNumber; | ||
|
||
// Program name | ||
public String programName; | ||
|
||
// List of TermSummaries | ||
public ArrayList<TermSummary> termSummaries; | ||
} |
13 changes: 13 additions & 0 deletions
13
spring-boot/src/main/java/com/opton/spring_boot/transcript_parser/types/TermSummary.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package com.opton.spring_boot.transcript_parser.types; | ||
import java.util.ArrayList; | ||
|
||
/**
 * What a transcript records for a single term.
 */
public class TermSummary {
    /** Term id, a number of the form 1189 (Fall 2018). */
    public int termId;

    /** Academic level, similar to 1A or 5C (delayed graduation). */
    public String level;

    /** Course codes, similar to CS 145, STAT 920, PD 1, CHINA 120R. */
    public ArrayList<String> courses;
}
42 changes: 42 additions & 0 deletions
42
spring-boot/src/main/java/com/opton/util/TermSeasonYearToId.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package com.opton.util; | ||
|
||
/**
 * Converts a term given as season name and year into a numeric term id.
 */
public class TermSeasonYearToId {

    /**
     * Computes the term id {@code (year - 1900) * 10 + month}, where the month is
     * 1 for Winter, 5 for Spring and 9 for Fall. For example, Fall 2018 gives 1189.
     *
     * @param maybeSeason season name; must be exactly {@code "Fall"}, {@code "Spring"} or
     *                    {@code "Winter"}
     * @param maybeYear   the year as a decimal integer string, e.g. {@code "2018"}
     * @return the term id
     * @throws IllegalArgumentException if {@code maybeSeason} is null or not a known season, or
     *                                  if {@code maybeYear} is not a parseable integer (the
     *                                  {@link NumberFormatException} is attached as the cause)
     */
    public static int termSeasonYearToId(String maybeSeason, String maybeYear) throws IllegalArgumentException {
        // switch on a null String throws NullPointerException; report it as a bad argument instead.
        if (maybeSeason == null) {
            throw new IllegalArgumentException("Not a season: " + maybeSeason);
        }

        int month;
        switch (maybeSeason) {
            case "Fall":
                month = 9;
                break;
            case "Spring":
                month = 5;
                break;
            case "Winter":
                month = 1;
                break;
            default:
                throw new IllegalArgumentException("Not a season: " + maybeSeason);
        }

        int year;
        try {
            year = Integer.parseInt(maybeYear);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Not a year: " + maybeYear, e);
        }

        return (year - 1900) * 10 + month;
    }

    /**
     * Example usage: prints the id generated for Fall 2023.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            int id = termSeasonYearToId("Fall", "2023");
            System.out.println("Generated ID: " + id);
        } catch (IllegalArgumentException e) {
            System.out.println("Error: " + e.getMessage());
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
spring.application.name=spring-boot | ||
spring.devtools.restart.additional-paths=src/main/java | ||
spring.devtools.restart.exclude=static/**,public/** |