Skip to content

Commit

Permalink
Merge pull request #5 from Opt-On/transcript-parsing
Browse files Browse the repository at this point in the history
Transcript parsing
  • Loading branch information
ericzhang11101 authored Feb 6, 2025
2 parents 8e67719 + b44ec81 commit 0581945
Show file tree
Hide file tree
Showing 10 changed files with 286 additions and 0 deletions.
Empty file modified spring-boot/mvnw
100644 → 100755
Empty file.
13 changes: 13 additions & 0 deletions spring-boot/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@
<version>1.18.36</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import com.opton.spring_boot.transcript_parser.TranscriptParser;

@RestController
@RequestMapping("/transcript")
public class TranscriptController {
Expand All @@ -28,8 +30,10 @@ public ResponseEntity<String> handleFileUpload(@RequestParam("file") MultipartFi

try {
// TODO: Call transcript service
TranscriptParser.ParseTranscript(file);
return ResponseEntity.status(HttpStatus.OK).body("File uploaded successfully");
} catch (Exception e) {
System.err.println(e.getMessage());
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Failed to process the PDF file");
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
package com.opton.spring_boot.transcript_parser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.web.multipart.MultipartFile;

import com.opton.spring_boot.transcript_parser.types.Summary;
import com.opton.spring_boot.transcript_parser.types.TermSummary;
import com.opton.util.TermSeasonYearToId;


/**
 * Extracts a {@link Summary} (student number, program name and per-term course lists)
 * from an uploaded PDF transcript.
 *
 * <p>All methods are static; the class holds only precompiled patterns.
 */
public class TranscriptParser {
    // Course line: department (2+ capitals), 2+ spaces, course number (1-3 digits plus an
    // optional word suffix), 1+ spaces, then the rest of the line. The trailing "\r?\n"
    // accepts both LF and CRLF line endings; "." never matches '\r', so requiring a bare
    // "\n" would reject every course line in CRLF-separated text.
    private static final Pattern COURSE_PATTERN =
            Pattern.compile("([A-Z]{2,})\\x20{2,}(\\d{1,3}\\w*)\\x20{1,}.*\\r?\\n");
    // A credit value such as "0.50".
    private static final Pattern CREDIT_PATTERN = Pattern.compile("\\d\\.\\d{2}");
    // "Level: 1A" -> group 1 is the level.
    private static final Pattern LEVEL_PATTERN = Pattern.compile("Level:\\s+(\\w{2,})");
    // "Student ID: 12345678" -> group 1 is the number.
    private static final Pattern STUDENT_ID_PATTERN = Pattern.compile("Student ID:\\s+(\\d+)");
    // A line consisting only of "<season> <4-digit year>"; groups are season and year.
    private static final Pattern TERM_PATTERN =
            Pattern.compile("(?m)^\\s*(Fall|Winter|Spring)\\s+(\\d{4})\\s*$");

    /**
     * Reports whether the whole of {@code courseLine} matches the course-line pattern.
     *
     * <p>NOTE: despite its name this method does not look at credit values; it only checks
     * that the input is a complete course line (including its line terminator). Use
     * {@link #isTransferCredit(String)} to detect transfer credits.
     *
     * @param courseLine text to test
     * @return {@code true} if the entire input is one course line
     */
    public static boolean IsTransferCredit(String courseLine) {
        return COURSE_PATTERN.matcher(courseLine).matches();
    }

    /**
     * Runs {@code matcher} to exhaustion and collects the index pairs of every match.
     *
     * <p>Each returned array has {@code 2 * (groupCount + 1)} entries: positions
     * {@code [2g, 2g + 1]} hold the start and end index of group {@code g}, where group 0 is
     * the full match. A group that did not participate in a match yields {@code -1} for both
     * entries (the values reported by {@link Matcher#start(int)} / {@link Matcher#end(int)}).
     *
     * @param matcher matcher positioned where the search should begin; it is advanced
     * @return one index array per match, in order of occurrence
     */
    public static List<int[]> findAllStringSubmatchIndex(Matcher matcher) {
        List<int[]> results = new ArrayList<>();

        while (matcher.find()) {
            int groupCount = matcher.groupCount();
            int[] indices = new int[(groupCount + 1) * 2];
            // Group 0 is the full match, so one loop covers it and every capturing group.
            for (int group = 0; group <= groupCount; group++) {
                indices[group * 2] = matcher.start(group);
                indices[group * 2 + 1] = matcher.end(group);
            }
            results.add(indices);
        }

        return results;
    }

    /**
     * Converts an uploaded PDF transcript into a {@link Summary}.
     *
     * <p>Parsed values are deliberately not echoed to stdout: the student number is
     * personal data and should not end up in console logs.
     *
     * @param file uploaded transcript; must be a non-empty file whose name ends in ".pdf"
     * @return summary populated with term summaries, student number and program name
     * @throws Exception if the file is rejected, cannot be read as a PDF, or the number of
     *         term headers differs from the number of levels
     * @throws IllegalArgumentException if the student ID or program name cannot be found
     */
    public static Summary ParseTranscript(MultipartFile file) throws Exception {
        String transcriptData = PDFToText(file);

        Summary summary = new Summary();
        summary.termSummaries = extractTermSummaries(transcriptData);
        summary.studentNumber = extractStudentNumber(transcriptData);
        summary.programName = extractProgramName(transcriptData);
        return summary;
    }

    /**
     * Extracts the text of an uploaded PDF.
     *
     * @param file uploaded file
     * @return the text produced by {@link PDFTextStripper} for the whole document
     * @throws Exception with message "bad file" if the upload is missing, empty, has no
     *         original filename, or its name does not end in ".pdf" (case-insensitive);
     *         with message "can't parse" (cause attached) if the PDF cannot be read
     */
    public static String PDFToText(MultipartFile file) throws Exception {
        // getOriginalFilename() may return null, so check before dereferencing it.
        String filename = (file == null) ? null : file.getOriginalFilename();
        if (file == null
                || file.isEmpty()
                || filename == null
                || !filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
            System.err.println("invalid pdf transcript uploaded");
            throw new Exception("bad file");
        }

        try (PDDocument document = PDDocument.load(file.getInputStream())) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("failed to parse pdf");
            // Keep the original failure attached so callers can inspect or log it.
            throw new Exception("can't parse", e);
        }
    }

    /**
     * Reports whether a course line describes a transfer credit.
     *
     * <p>{@code courseLine} is of one of the following forms:
     *
     * <pre>
     * ECON 102 Macroeconomics 0.50 0.50 98
     * ECON 102 Macroeconomics
     * ECON 102 Macroeconomics 0.50
     * </pre>
     *
     * Those are, in order: past term course, current term course, transfer credit. Only the
     * last form, with exactly one credit value, is a transfer credit.
     *
     * @param courseLine a single course line
     * @return {@code true} if the line contains exactly one credit value
     */
    public static boolean isTransferCredit(String courseLine) {
        Matcher creditMatcher = CREDIT_PATTERN.matcher(courseLine);
        int creditCount = 0;
        while (creditMatcher.find()) {
            creditCount++;
        }
        return creditCount == 1;
    }

    /**
     * Splits the transcript text into terms and collects the courses listed under each.
     *
     * <p>The i-th term header is paired with the i-th "Level:" occurrence. A course belongs
     * to a term when it starts before the next term header; consequently, course lines that
     * precede the first header are attributed to the first term, and the last term receives
     * every remaining course. Transfer credits are skipped. Course codes are stored as the
     * lowercased concatenation of department and number (for example "econ102").
     *
     * @param text full transcript text
     * @return one {@link TermSummary} per term header, in document order
     * @throws Exception if the number of term headers differs from the number of levels
     * @throws IllegalArgumentException if a term header cannot be converted to a term id
     */
    public static ArrayList<TermSummary> extractTermSummaries(String text) throws Exception {
        List<int[]> termMatches = findAllStringSubmatchIndex(TERM_PATTERN.matcher(text));
        List<int[]> levelMatches = findAllStringSubmatchIndex(LEVEL_PATTERN.matcher(text));
        List<int[]> courseMatches = findAllStringSubmatchIndex(COURSE_PATTERN.matcher(text));

        if (termMatches.size() != levelMatches.size()) {
            throw new Exception("num terms != num levels");
        }

        ArrayList<TermSummary> termSummaries = new ArrayList<>();

        // Shared across terms: each course is consumed exactly once, in document order.
        int courseIndex = 0;
        for (int i = 0; i < termMatches.size(); i++) {
            int[] term = termMatches.get(i);
            int[] level = levelMatches.get(i);

            String season = text.substring(term[2], term[3]);
            String year = text.substring(term[4], term[5]);

            TermSummary termSummary = new TermSummary();
            termSummary.termId = TermSeasonYearToId.termSeasonYearToId(season, year);
            termSummary.level = text.substring(level[2], level[3]);
            termSummary.courses = new ArrayList<>();

            // The last term has no following header, so it takes all remaining courses.
            boolean lastTerm = (i == termMatches.size() - 1);
            int nextTermStart = lastTerm ? Integer.MAX_VALUE : termMatches.get(i + 1)[0];

            while (courseIndex < courseMatches.size()
                    && courseMatches.get(courseIndex)[0] < nextTermStart) {
                int[] course = courseMatches.get(courseIndex);
                courseIndex++;

                if (isTransferCredit(text.substring(course[0], course[1]))) {
                    continue;
                }

                String department = text.substring(course[2], course[3]);
                String number = text.substring(course[4], course[5]);
                // TODO: use some data type for courses (vs combined string)
                // Locale.ROOT keeps the result independent of the server's default locale.
                termSummary.courses.add((department + number).toLowerCase(Locale.ROOT));
            }

            termSummaries.add(termSummary);
        }

        return termSummaries;
    }

    /**
     * Finds the first "Student ID:" entry and returns its number.
     *
     * @param text full transcript text
     * @return the student number
     * @throws IllegalArgumentException if no student ID is present or it does not fit an int
     */
    public static int extractStudentNumber(String text) throws IllegalArgumentException {
        Matcher studentNumberMatcher = STUDENT_ID_PATTERN.matcher(text);
        if (!studentNumberMatcher.find()) {
            throw new IllegalArgumentException("no student id");
        }

        String studentNumberStr = studentNumberMatcher.group(1);
        try {
            return Integer.parseInt(studentNumberStr);
        } catch (NumberFormatException e) {
            // The pattern guarantees digits, so this can only be an out-of-range value.
            throw new IllegalArgumentException("Invalid student number: " + studentNumberStr, e);
        }
    }

    /**
     * Returns the program name that follows the last "Program:" label.
     *
     * <p>The name runs from the label up to the next ',' or '\n' and is trimmed.
     *
     * @param text full transcript text
     * @return the trimmed program name
     * @throws IllegalArgumentException if the label is missing, or the text ends before a
     *         ',' or '\n' delimiter is reached
     */
    public static String extractProgramName(String text) throws IllegalArgumentException {
        final String label = "Program:";

        int labelIndex = text.lastIndexOf(label);
        if (labelIndex == -1) {
            throw new IllegalArgumentException("Program name not found");
        }

        int start = labelIndex + label.length();
        for (int end = start; end < text.length(); end++) {
            char ch = text.charAt(end);
            if (ch == ',' || ch == '\n') {
                return text.substring(start, end).trim();
            }
        }

        throw new IllegalArgumentException("Unexpected end of transcript");
    }

}

Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.opton.spring_boot.transcript_parser.types;

import java.util.ArrayList;

/**
 * Result of parsing one transcript: who the student is, which program they are in,
 * and what they took in each term. Populated by TranscriptParser.ParseTranscript.
 */
public class Summary {
// Student number, parsed from the digits following "Student ID:" in the transcript
public int studentNumber;

// Program name: the trimmed text after the last "Program:" label, up to the next ',' or newline
public String programName;

// One TermSummary per term header found in the transcript, in document order
public ArrayList<TermSummary> termSummaries;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package com.opton.spring_boot.transcript_parser.types;
import java.util.ArrayList;

/**
 * Courses and academic level for a single term of a transcript.
 */
public class TermSummary {
// Term ids are numbers of the form 1189 (Fall 2018),
// i.e. (year - 1900) * 10 + starting month (Winter = 1, Spring = 5, Fall = 9)
public int termId;

// Levels are similar to 1A, 5C (delayed graduation)
public String level;

// Course codes are similar to CS 145, STAT 920, PD 1, CHINA 120R.
// NOTE: TranscriptParser stores them as the lowercased department + number
// with no separator (e.g. "cs145").
public ArrayList<String> courses;
}
42 changes: 42 additions & 0 deletions spring-boot/src/main/java/com/opton/util/TermSeasonYearToId.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package com.opton.util;

/**
 * Converts a term's season and year into a numeric term id.
 */
public class TermSeasonYearToId {

    /**
     * Builds a term id of the form {@code (year - 1900) * 10 + month}, where month is the
     * term's starting month: Winter = 1, Spring = 5, Fall = 9. For example Fall 2018 is 1189.
     *
     * @param maybeSeason season name; must be exactly "Fall", "Spring" or "Winter"
     * @param maybeYear   year as a decimal integer string, e.g. "2018"
     * @return the term id
     * @throws IllegalArgumentException if the season is null or not one of the three names,
     *         or the year is not a parsable integer
     */
    public static int termSeasonYearToId(String maybeSeason, String maybeYear) throws IllegalArgumentException {
        // A switch on a null String throws NullPointerException; reject null explicitly so
        // callers only ever see the documented IllegalArgumentException.
        if (maybeSeason == null) {
            throw new IllegalArgumentException("Not a season: " + maybeSeason);
        }

        int month;
        switch (maybeSeason) {
            case "Fall":
                month = 9;
                break;
            case "Spring":
                month = 5;
                break;
            case "Winter":
                month = 1;
                break;
            default:
                System.out.println(maybeSeason + " not season");
                throw new IllegalArgumentException("Not a season: " + maybeSeason);
        }

        int year;
        try {
            // Integer.parseInt(null) also throws NumberFormatException, so null is covered.
            year = Integer.parseInt(maybeYear);
        } catch (NumberFormatException e) {
            System.out.println("not year");
            throw new IllegalArgumentException("Not a year: " + maybeYear, e);
        }

        return (year - 1900) * 10 + month;
    }

    /** Small manual demo: prints the id generated for Fall 2023. */
    public static void main(String[] args) {
        // Example usage
        try {
            int id = termSeasonYearToId("Fall", "2023");
            System.out.println("Generated ID: " + id);
        } catch (IllegalArgumentException e) {
            System.out.println("Error: " + e.getMessage());
        }
    }
}
2 changes: 2 additions & 0 deletions spring-boot/src/main/resources/application.properties
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
spring.application.name=spring-boot
spring.devtools.restart.additional-paths=src/main/java
spring.devtools.restart.exclude=static/**,public/**

0 comments on commit 0581945

Please sign in to comment.