Skip to content

Commit 7012aa4

Browse files
authored
Feature/normalization option (#1479)
* Added option for normalization. * Enabled normalization for java and cpp2 * Updated help text in documentation. * Rough implementation of language-specific normalization option. * The normalization flag is now passed to the language modules. * Implemented simple test for the Language interface * Changed the Language interface to force all Language implementations to implement the parse(Set, boolean) function. * Fixed error in ScalaLanguage due to Language interface change. * Fixed error in ScalaLanguageTest due to Language interface change.
1 parent 0f2a6c9 commit 7012aa4

File tree

25 files changed

+113
-46
lines changed

25 files changed

+113
-46
lines changed

cli/src/main/java/de/jplag/cli/CLI.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ public JPlagOptions buildOptionsFromArguments(ParseResult parseResult) throws Cl
182182
JPlagOptions jPlagOptions = new JPlagOptions(loadLanguage(parseResult), this.options.minTokenMatch, submissionDirectories,
183183
oldSubmissionDirectories, null, this.options.advanced.subdirectory, suffixes, this.options.advanced.exclusionFileName,
184184
JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.options.advanced.similarityThreshold, this.options.shownComparisons, clusteringOptions,
185-
this.options.advanced.debug, mergingOptions);
185+
this.options.advanced.debug, mergingOptions, this.options.normalize);
186186

187187
String baseCodePath = this.options.baseCode;
188188
File baseCodeDirectory = baseCodePath == null ? null : new File(baseCodePath);

cli/src/main/java/de/jplag/cli/CliOptions.java

+3
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ public class CliOptions implements Runnable {
6363
@ArgGroup(validate = false, heading = "Merging of neighboring matches to increase the similarity of concealed plagiarism:%n")
6464
public Merging merging = new Merging();
6565

66+
@Option(names = {"--normalize"}, description = "Activate the normalization of tokens. Supported for languages: Java, C++.")
67+
public boolean normalize = false;
68+
6669
/**
6770
* Empty run method, so picocli prints help automatically
6871
*/

core/src/main/java/de/jplag/JPlag.java

+7
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {
7171
// Parse and validate submissions.
7272
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
7373
SubmissionSet submissionSet = builder.buildSubmissionSet();
74+
if (options.normalize() && options.language().supportsNormalization() && options.language().requiresCoreNormalization()) {
75+
submissionSet.normalizeSubmissions();
76+
}
7477
int submissionCount = submissionSet.numberOfSubmissions();
7578
if (submissionCount < 2)
7679
throw new SubmissionException("Not enough valid submissions! (found " + submissionCount + " valid submissions)");
@@ -103,6 +106,10 @@ private static void logSkippedSubmissions(SubmissionSet submissionSet, JPlagOpti
103106
}
104107

105108
private static void checkForConfigurationConsistency(JPlagOptions options) throws RootDirectoryException {
109+
if (options.normalize() && !options.language().supportsNormalization()) {
110+
logger.error(String.format("The language %s cannot be used with normalization.", options.language().getName()));
111+
}
112+
106113
List<String> duplicateNames = getDuplicateSubmissionFolderNames(options);
107114
if (duplicateNames.size() > 0) {
108115
throw new RootDirectoryException(String.format("Duplicate root directory names found: %s", String.join(", ", duplicateNames)));

core/src/main/java/de/jplag/Submission.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
241241
* Parse files of the submission.
242242
* @return Whether parsing was successful.
243243
*/
244-
/* package-private */ boolean parse(boolean debugParser) {
244+
/* package-private */ boolean parse(boolean debugParser, boolean normalize) {
245245
if (files == null || files.isEmpty()) {
246246
logger.error("ERROR: nothing to parse for submission \"{}\"", name);
247247
tokenList = null;
@@ -250,7 +250,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
250250
}
251251

252252
try {
253-
tokenList = language.parse(new HashSet<>(files));
253+
tokenList = language.parse(new HashSet<>(files), normalize);
254254
if (logger.isDebugEnabled()) {
255255
for (Token token : tokenList) {
256256
logger.debug(String.join(" | ", token.getType().toString(), Integer.toString(token.getLine()), token.getSemantics().toString()));

core/src/main/java/de/jplag/SubmissionSet.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ private void parseAllSubmissions() throws ExitException {
119119
private void parseBaseCodeSubmission(Submission baseCode) throws BasecodeException {
120120
long startTime = System.currentTimeMillis();
121121
logger.trace("----- Parsing basecode submission: " + baseCode.getName());
122-
if (!baseCode.parse(options.debugParser())) {
122+
if (!baseCode.parse(options.debugParser(), options.normalize())) {
123123
throw new BasecodeException("Could not successfully parse basecode submission!");
124124
} else if (baseCode.getNumberOfTokens() < options.minimumTokenMatch()) {
125125
throw new BasecodeException(String.format("Basecode submission contains %d token(s), which is less than the minimum match length (%d)!",
@@ -150,7 +150,7 @@ private void parseSubmissions(List<Submission> submissions) {
150150
logger.trace("------ Parsing submission: " + submission.getName());
151151
currentSubmissionName = submission.getName();
152152

153-
if (!(ok = submission.parse(options.debugParser()))) {
153+
if (!(ok = submission.parse(options.debugParser(), options.normalize()))) {
154154
errors++;
155155
}
156156

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package de.jplag.exceptions;
2+
3+
/**
4+
* Exception used if the configuration is wrong.
5+
*/
6+
public class ConfigurationException extends ExitException {
7+
public ConfigurationException(String message) {
8+
super(message);
9+
}
10+
}

core/src/main/java/de/jplag/options/JPlagOptions.java

+26-18
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Lang
5555
@JsonProperty("subdirectory_name") String subdirectoryName, @JsonProperty("file_suffixes") List<String> fileSuffixes,
5656
@JsonProperty("exclusion_file_name") String exclusionFileName, @JsonProperty("similarity_metric") SimilarityMetric similarityMetric,
5757
@JsonProperty("similarity_threshold") double similarityThreshold, @JsonProperty("max_comparisons") int maximumNumberOfComparisons,
58-
@JsonProperty("cluster") ClusteringOptions clusteringOptions, boolean debugParser, @JsonProperty("merging") MergingOptions mergingOptions) {
58+
@JsonProperty("cluster") ClusteringOptions clusteringOptions, boolean debugParser, @JsonProperty("merging") MergingOptions mergingOptions,
59+
@JsonProperty("normalize") boolean normalize) {
5960

6061
public static final double DEFAULT_SIMILARITY_THRESHOLD = 0;
6162
public static final int DEFAULT_SHOWN_COMPARISONS = 500;
@@ -68,13 +69,13 @@ public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Lang
6869

6970
public JPlagOptions(Language language, Set<File> submissionDirectories, Set<File> oldSubmissionDirectories) {
7071
this(language, null, submissionDirectories, oldSubmissionDirectories, null, null, null, null, DEFAULT_SIMILARITY_METRIC,
71-
DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingOptions());
72+
DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingOptions(), false);
7273
}
7374

7475
public JPlagOptions(Language language, Integer minimumTokenMatch, Set<File> submissionDirectories, Set<File> oldSubmissionDirectories,
7576
File baseCodeSubmissionDirectory, String subdirectoryName, List<String> fileSuffixes, String exclusionFileName,
7677
SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions,
77-
boolean debugParser, MergingOptions mergingOptions) {
78+
boolean debugParser, MergingOptions mergingOptions, boolean normalize) {
7879
this.language = language;
7980
this.debugParser = debugParser;
8081
this.fileSuffixes = fileSuffixes == null || fileSuffixes.isEmpty() ? null : Collections.unmodifiableList(fileSuffixes);
@@ -89,90 +90,97 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, Set<File> subm
8990
this.subdirectoryName = subdirectoryName;
9091
this.clusteringOptions = clusteringOptions;
9192
this.mergingOptions = mergingOptions;
93+
this.normalize = normalize;
9294
}
9395

9496
public JPlagOptions withLanguageOption(Language language) {
9597
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
9698
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
97-
clusteringOptions, debugParser, mergingOptions);
99+
clusteringOptions, debugParser, mergingOptions, normalize);
98100
}
99101

100102
public JPlagOptions withDebugParser(boolean debugParser) {
101103
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
102104
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
103-
clusteringOptions, debugParser, mergingOptions);
105+
clusteringOptions, debugParser, mergingOptions, normalize);
104106
}
105107

106108
public JPlagOptions withFileSuffixes(List<String> fileSuffixes) {
107109
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
108110
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
109-
clusteringOptions, debugParser, mergingOptions);
111+
clusteringOptions, debugParser, mergingOptions, normalize);
110112
}
111113

112114
public JPlagOptions withSimilarityThreshold(double similarityThreshold) {
113115
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
114116
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
115-
clusteringOptions, debugParser, mergingOptions);
117+
clusteringOptions, debugParser, mergingOptions, normalize);
116118
}
117119

118120
public JPlagOptions withMaximumNumberOfComparisons(int maximumNumberOfComparisons) {
119121
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
120122
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
121-
clusteringOptions, debugParser, mergingOptions);
123+
clusteringOptions, debugParser, mergingOptions, normalize);
122124
}
123125

124126
public JPlagOptions withSimilarityMetric(SimilarityMetric similarityMetric) {
125127
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
126128
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
127-
clusteringOptions, debugParser, mergingOptions);
129+
clusteringOptions, debugParser, mergingOptions, normalize);
128130
}
129131

130132
public JPlagOptions withMinimumTokenMatch(Integer minimumTokenMatch) {
131133
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
132134
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
133-
clusteringOptions, debugParser, mergingOptions);
135+
clusteringOptions, debugParser, mergingOptions, normalize);
134136
}
135137

136138
public JPlagOptions withExclusionFileName(String exclusionFileName) {
137139
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
138140
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
139-
clusteringOptions, debugParser, mergingOptions);
141+
clusteringOptions, debugParser, mergingOptions, normalize);
140142
}
141143

142144
public JPlagOptions withSubmissionDirectories(Set<File> submissionDirectories) {
143145
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
144146
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
145-
clusteringOptions, debugParser, mergingOptions);
147+
clusteringOptions, debugParser, mergingOptions, normalize);
146148
}
147149

148150
public JPlagOptions withOldSubmissionDirectories(Set<File> oldSubmissionDirectories) {
149151
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
150152
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
151-
clusteringOptions, debugParser, mergingOptions);
153+
clusteringOptions, debugParser, mergingOptions, normalize);
152154
}
153155

154156
public JPlagOptions withBaseCodeSubmissionDirectory(File baseCodeSubmissionDirectory) {
155157
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
156158
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
157-
clusteringOptions, debugParser, mergingOptions);
159+
clusteringOptions, debugParser, mergingOptions, normalize);
158160
}
159161

160162
public JPlagOptions withSubdirectoryName(String subdirectoryName) {
161163
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
162164
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
163-
clusteringOptions, debugParser, mergingOptions);
165+
clusteringOptions, debugParser, mergingOptions, normalize);
164166
}
165167

166168
public JPlagOptions withClusteringOptions(ClusteringOptions clusteringOptions) {
167169
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
168170
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
169-
clusteringOptions, debugParser, mergingOptions);
171+
clusteringOptions, debugParser, mergingOptions, normalize);
170172
}
171173

172174
public JPlagOptions withMergingOptions(MergingOptions mergingOptions) {
173175
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
174176
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
175-
clusteringOptions, debugParser, mergingOptions);
177+
clusteringOptions, debugParser, mergingOptions, normalize);
178+
}
179+
180+
public JPlagOptions withNormalize(boolean normalize) {
181+
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
182+
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
183+
clusteringOptions, debugParser, mergingOptions, normalize);
176184
}
177185

178186
public boolean hasBaseCode() {
@@ -264,7 +272,7 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, File submissio
264272
boolean debugParser, MergingOptions mergingOptions) throws BasecodeException {
265273
this(language, minimumTokenMatch, Set.of(submissionDirectory), oldSubmissionDirectories,
266274
convertLegacyBaseCodeToFile(baseCodeSubmissionName, submissionDirectory), subdirectoryName, fileSuffixes, exclusionFileName,
267-
similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingOptions);
275+
similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingOptions, false);
268276
}
269277

270278
/**

language-antlr-utils/src/main/java/de/jplag/antlr/AbstractAntlrLanguage.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ protected AbstractAntlrLanguage() {
3434
}
3535

3636
@Override
37-
public List<Token> parse(Set<File> files) throws ParsingException {
37+
public List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
3838
if (this.parser == null) {
39-
this.parser = this.initializeParser();
39+
this.parser = this.initializeParser(normalize);
4040
}
4141

4242
return this.parser.parse(files);
@@ -46,7 +46,7 @@ public List<Token> parse(Set<File> files) throws ParsingException {
4646
* Lazily creates the parser. Has to be implemented, if no parser is passed in the constructor.
4747
* @return The newly initialized parser
4848
*/
49-
protected AbstractAntlrParserAdapter<?> initializeParser() {
49+
protected AbstractAntlrParserAdapter<?> initializeParser(boolean normalize) {
5050
throw new UnsupportedOperationException(
5151
String.format("The initializeParser method needs to be implemented for %s", this.getClass().getName()));
5252
}

language-antlr-utils/src/test/java/de/jplag/antlr/LanguageTest.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,19 @@ class LanguageTest {
2020
void testExceptionForNoDefinedParser() {
2121
LanguageWithoutParser lang = new LanguageWithoutParser();
2222
Set<File> emptySet = Set.of();
23-
assertThrows(UnsupportedOperationException.class, () -> lang.parse(emptySet));
23+
assertThrows(UnsupportedOperationException.class, () -> lang.parse(emptySet, false));
2424
}
2525

2626
@Test
2727
void testLanguageWithStaticParser() throws ParsingException {
2828
TestLanguage lang = new TestLanguage();
29-
Assertions.assertEquals(0, lang.parse(Set.of()).size());
29+
Assertions.assertEquals(0, lang.parse(Set.of(), false).size());
3030
}
3131

3232
@Test
3333
void testLanguageWithLazyParser() throws ParsingException {
3434
LanguageWithLazyParser lang = new LanguageWithLazyParser();
35-
Assertions.assertEquals(0, lang.parse(Set.of()).size());
35+
Assertions.assertEquals(0, lang.parse(Set.of(), false).size());
3636
}
3737

3838
private static class LanguageWithoutParser extends AbstractAntlrLanguage {
@@ -59,7 +59,7 @@ public int minimumTokenMatch() {
5959

6060
private static class LanguageWithLazyParser extends LanguageWithoutParser {
6161
@Override
62-
protected AbstractAntlrParserAdapter<?> initializeParser() {
62+
protected AbstractAntlrParserAdapter<?> initializeParser(boolean normalize) {
6363
return new TestParserAdapter();
6464
}
6565
}

language-api/src/main/java/de/jplag/Language.java

+30-2
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,25 @@ public interface Language {
3232
int minimumTokenMatch();
3333

3434
/**
35-
* Parses a set of files.
35+
* Parses a set of files without normalization. The default implementation delegates to {@link #parse(Set, boolean)} with normalization disabled.
3636
* @param files are the files to parse.
3737
* @return the list of parsed JPlag tokens.
3838
* @throws ParsingException if an error during parsing the files occurred.
39+
* @deprecated Replaced by {@link #parse(Set, boolean)}
3940
*/
40-
List<Token> parse(Set<File> files) throws ParsingException;
41+
@Deprecated(forRemoval = true)
42+
default List<Token> parse(Set<File> files) throws ParsingException {
43+
return parse(files, false);
44+
}
45+
46+
/**
47+
* Parses a set of files. Every language has to implement this method; the flag indicates whether the tokens should be normalized within the language module.
48+
* @param files are the files to parse.
49+
* @param normalize True, if the tokens should be normalized
50+
* @return the list of parsed JPlag tokens.
51+
* @throws ParsingException if an error during parsing the files occurred.
52+
*/
53+
List<Token> parse(Set<File> files, boolean normalize) throws ParsingException;
4154

4255
/**
4356
* Indicates whether the tokens returned by parse have semantic information added to them, i.e. whether the token
@@ -93,4 +106,19 @@ default boolean expectsSubmissionOrder() {
93106
default List<File> customizeSubmissionOrder(List<File> submissions) {
94107
return submissions;
95108
}
109+
110+
/**
111+
* @return True, if tokens for this language can be normalized
112+
*/
113+
default boolean supportsNormalization() {
114+
return false;
115+
}
116+
117+
/**
118+
* Override this method, if you need normalization within the language module, but not in the core module.
119+
* @return True if the core normalization should be used.
120+
*/
121+
default boolean requiresCoreNormalization() {
122+
return true;
123+
}
96124
}

0 commit comments

Comments
 (0)