From 3e9c187e052a2b6988424a5bc31af7e5f2a8425c Mon Sep 17 00:00:00 2001 From: leogott <61663141+leogott@users.noreply.github.com> Date: Fri, 25 Sep 2020 02:11:24 +0200 Subject: [PATCH 1/3] Move bodge to Java --- .../acoli/conll/rdf/CoNLLRDFAnnotator.java | 8 +- .../acoli/conll/rdf/CoNLLRDFFormatter.java | 98 ++++++++++--------- .../org/acoli/conll/rdf/CoNLLRDFUpdater.java | 20 ++-- 3 files changed, 67 insertions(+), 59 deletions(-) diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java index 4bb3a7b..eb57642 100644 --- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java +++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java @@ -62,8 +62,10 @@ public static void main(String[] args) throws IOException { while((line = in.readLine())!=null) { line=line.replaceAll("[\t ]+"," ").trim(); - if(!buffer.trim().equals("")) - if((line.startsWith("@") || line.startsWith("#")) && !lastLine.startsWith("@") && !lastLine.startsWith("#")) { + if(!buffer.trim().equals("") && + (line.startsWith("@") || line.startsWith("PREFIX") || line.startsWith("#")) && + !(lastLine.startsWith("@") || lastLine.startsWith("PREFIX") || lastLine.startsWith("#")) + ) { while(!command.trim().equals(">")) { System.err.print( "actions ............................................................................................................\n"+ @@ -108,7 +110,7 @@ public static void main(String[] args) throws IOException { command = ""; } //System.err.println(ANSI_RED+"> "+line+ANSI_RESET); - if(line.trim().startsWith("@") && !lastLine.trim().endsWith(".")) + if((line.trim().startsWith("@") || line.trim().startsWith("PREFIX")) && !lastLine.trim().endsWith(".")) //System.out.print("\n"); buffer=buffer+"\n"; diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java index 7ed480b..11e402f 100644 --- 
a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java +++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java @@ -423,9 +423,9 @@ protected static String reorderTTLBuffer(String buffer, List cols) { String line; while((line=in.readLine())!=null) { line=line.trim(); - if(line.startsWith("@")) result=result+line+"\n"; else - if(line.startsWith("#")) result=result+line+"\n"; else - if(!line.equals("")) { + if(line.startsWith("@") || line.startsWith("PREFIX") || line.startsWith("#")) { + result=result+line+"\n"; + } else if(!line.equals("")) { //reorder columns according to user list. String orderedLine = ""; List statements = new ArrayList(Arrays.asList(line.substring(0, line.lastIndexOf(".")-1).split(";\\s*\t"))); //TODO: only consider ; not ";" @@ -665,62 +665,64 @@ protected void processSentenceStream() throws IOException { while((line = getInputStream().readLine())!=null) { line=line.replaceAll("[\t ]+"," ").trim(); - if(!buffer.trim().equals("")) - if((line.startsWith("@") || line.startsWith("#")) && !lastLine.startsWith("@") && !lastLine.startsWith("#")) { //!buffer.matches("@[^\n]*\n?$")) { - for (Module m:modules) { - if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols())); - if(m.getMode()==Mode.DEBUG) System.err.println(colorTTL(reorderTTLBuffer(buffer, m.getCols()))); - if(m.getMode()==Mode.CONLL) { - if (m.getCols().size() < 1) {// no column args supplied - LOG.info("No column names in cmd args, searching rdf comments.."); - List conllColumns = findColumnNamesInRDFBuffer(buffer); + if(!buffer.trim().equals("") && + ((line.startsWith("@") || line.startsWith("PREFIX")) || line.startsWith("#")) && + !(lastLine.startsWith("@") || lastLine.startsWith("PREFIX") || lastLine.startsWith("#")) + ) { + for (Module m:modules) { + if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols())); + if(m.getMode()==Mode.DEBUG) System.err.println(colorTTL(reorderTTLBuffer(buffer, 
m.getCols()))); + if(m.getMode()==Mode.CONLL) { + if (m.getCols().size() < 1) {// no column args supplied + LOG.info("No column names in cmd args, searching rdf comments.."); + List conllColumns = findColumnNamesInRDFBuffer(buffer); + if (conllColumns.size()>0) { + LOG.info("Using #global.comments from rdf"); + m.setCols(conllColumns); + } else { + LOG.info("Trying conll columns now.."); + conllColumns = CoNLLStreamExtractor.findFieldsFromComments(new BufferedReader(new StringReader(buffer.trim())), 1); if (conllColumns.size()>0) { - LOG.info("Using #global.comments from rdf"); m.setCols(conllColumns); - } else { - LOG.info("Trying conll columns now.."); - conllColumns = CoNLLStreamExtractor.findFieldsFromComments(new BufferedReader(new StringReader(buffer.trim())), 1); - if (conllColumns.size()>0) { - m.setCols(conllColumns); - } } } - if (m.getCols().size() < 1) { - LOG.info("Supply column names some way! (-conll arg, global.columns or rdf comments"); - } - else - printSparql(buffer, columnsAsSelect(m.getCols()), new OutputStreamWriter(m.getOutputStream())); } - if(m.getMode()==Mode.QUERY) printSparql(buffer, m.getSelect(), new OutputStreamWriter(m.getOutputStream())); - if(m.getMode()==Mode.GRAMMAR) m.getOutputStream().println(extractCoNLLGraph(buffer,true)); - if(m.getMode()==Mode.SEMANTICS) m.getOutputStream().println(extractTermGraph(buffer,true)); - if(m.getMode()==Mode.GRAMMAR_SEMANTICS) { - m.getOutputStream().println(extractCoNLLGraph(buffer,true)); - m.getOutputStream().println(extractTermGraph(buffer,false)); + if (m.getCols().size() < 1) { + LOG.info("Supply column names some way! 
(-conll arg, global.columns or rdf comments"); } + else + printSparql(buffer, columnsAsSelect(m.getCols()), new OutputStreamWriter(m.getOutputStream())); + } + if(m.getMode()==Mode.QUERY) printSparql(buffer, m.getSelect(), new OutputStreamWriter(m.getOutputStream())); + if(m.getMode()==Mode.GRAMMAR) m.getOutputStream().println(extractCoNLLGraph(buffer,true)); + if(m.getMode()==Mode.SEMANTICS) m.getOutputStream().println(extractTermGraph(buffer,true)); + if(m.getMode()==Mode.GRAMMAR_SEMANTICS) { + m.getOutputStream().println(extractCoNLLGraph(buffer,true)); + m.getOutputStream().println(extractTermGraph(buffer,false)); } - buffer=""; } - //System.err.println(ANSI_RED+"> "+line+ANSI_RESET); - if(line.trim().startsWith("@") && !lastLine.trim().endsWith(".")) - //System.out.print("\n"); - buffer=buffer+"\n"; + buffer=""; + } + //System.err.println(ANSI_RED+"> "+line+ANSI_RESET); + if((line.trim().startsWith("@") || line.trim().startsWith("PREFIX")) && !lastLine.trim().endsWith(".")) + //System.out.print("\n"); + buffer=buffer+"\n"; - if(line.trim().startsWith("#") && (!lastLine.trim().startsWith("#"))) - // System.out.print("\n"); - buffer=buffer+"\n"; - - //System.out.print(" "+color(line)); - //System.out.print(color(line)); - buffer=buffer+line+"\t";//+"\n"; + if(line.trim().startsWith("#") && (!lastLine.trim().startsWith("#"))) + // System.out.print("\n"); + buffer=buffer+"\n"; + + //System.out.print(" "+color(line)); + //System.out.print(color(line)); + buffer=buffer+line+"\t";//+"\n"; - if(line.trim().endsWith(".") || line.trim().matches("^(.*>)?[^<]*#")) - //System.out.print("\n"); - buffer=buffer+"\n"; + if(line.trim().endsWith(".") || line.trim().matches("^(.*>)?[^<]*#")) + //System.out.print("\n"); + buffer=buffer+"\n"; - //System.out.println(); - lastLine=line; - } + //System.out.println(); + lastLine=line; + } for (Module m:modules) { if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols())); diff --git 
a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java index 3cf53f0..1b1ba45 100644 --- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java +++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java @@ -806,17 +806,21 @@ protected void processSentenceStream() throws IOException { String lastLine =""; String buffer=""; // List > dRTs = new ArrayList >(); // iterations and execution time of each update in seconds +// TODO Refactor @Leo while((line = getInputStream().readLine())!=null) { - line=line.replaceAll("[\t ]+"," ").trim(); + line=line.replaceAll("[\t ]+"," ").trim(); // TODO this will mess-up multiline strings with lines ending in whitespace - if(!buffer.trim().equals("") && (line.startsWith("@") || line.startsWith("#")) && !lastLine.startsWith("@") && !lastLine.startsWith("#")) { //!buffer.matches("@[^\n]*\n?$")) { - // If the buffer is not empty and the current line starts with @ or # - // and the previous line did not start with @ or # + if(!buffer.trim().equals("") && + (line.startsWith("@") || line.startsWith("#") || line.startsWith("PREFIX")) && + !(lastLine.startsWith("@") || lastLine.startsWith("#") || lastLine.startsWith("PREFIX")) + ) { + // If the buffer is not empty and the current line starts with @ or # or PREFIX + // and the previous line did not start with @ or # or PREFIX // check if the buffer contains a ttl prefix - if (buffer.contains("@prefix")) { + if (buffer.contains("@prefix") || buffer.contains("PREFIX")) { prefixCache = new String(); for (String buffLine:buffer.split("\n")) { - if (buffLine.trim().startsWith("@prefix")) { + if (buffLine.trim().startsWith("@prefix") || buffLine.trim().startsWith("PREFIX")) { prefixCache += buffLine+"\n"; } } @@ -862,7 +866,7 @@ protected void processSentenceStream() throws IOException { } // FINAL SENTENCE (with prefixes if necessary) - if (!buffer.contains("@prefix")) { + if (!(buffer.contains("@prefix") || buffer.contains("PREFIX"))) 
{ buffer = prefixCache+buffer; } @@ -975,7 +979,7 @@ private synchronized void flushOutputBuffer(PrintStream out) { if (prefixDeduplication) { String prefixCacheTMP = new String(); for (String buffLine:sentBufferOut.remove(0).split("\n")) { - if (buffLine.trim().startsWith("@prefix")) { + if (buffLine.trim().startsWith("@prefix") || buffLine.trim().startsWith("PREFIX")) { prefixCacheTMP += buffLine+"\n"; } else if (!buffLine.trim().isEmpty()) { outString += buffLine+"\n"; From 1b6e6231187a2c24ca5a9edc51f67e49cface61b Mon Sep 17 00:00:00 2001 From: leogott <61663141+leogott@users.noreply.github.com> Date: Thu, 3 Feb 2022 06:59:07 +0100 Subject: [PATCH 2/3] Remove hack in run.sh --- run.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/run.sh b/run.sh index bc73968..d51964c 100755 --- a/run.sh +++ b/run.sh @@ -48,11 +48,7 @@ mvn_exec_goal=("exec:java" "-Dexec.mainClass=org.acoli.conll.rdf.$class_name" "- all_args=("${mvn_args[@]}" "${mvn_compile_goal[@]}" "${mvn_exec_goal[@]}") -mvn "${all_args[@]}" | - # the following is a hack to allow CoNLLRDFUpdater to process output of CoNLLBrackets2RDF - # currently, CoNLLRDFUpdater supportes the historical Turtle 1.0 prefix only - # TODO: fix this in CoNLLRDFUpdater, this is a hack, only - sed -e s/'^[\t ]*PREFIX \(.*\)$'/'@prefix \1 .'/g +mvn "${all_args[@]}" # The following commands were pretty useful in debugging the code above # echo ${#mvn_exec_goal[@]} # echo the size of the array From 991197d865116770143b7f34e787416af0c154ea Mon Sep 17 00:00:00 2001 From: leogott <61663141+leogott@users.noreply.github.com> Date: Thu, 3 Feb 2022 16:28:17 +0100 Subject: [PATCH 3/3] Revert "Remove hack in run.sh" This reverts commit 1b6e6231187a2c24ca5a9edc51f67e49cface61b. 
--- run.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/run.sh b/run.sh index d51964c..bc73968 100755 --- a/run.sh +++ b/run.sh @@ -48,7 +48,11 @@ mvn_exec_goal=("exec:java" "-Dexec.mainClass=org.acoli.conll.rdf.$class_name" "- all_args=("${mvn_args[@]}" "${mvn_compile_goal[@]}" "${mvn_exec_goal[@]}") -mvn "${all_args[@]}" +mvn "${all_args[@]}" | + # the following is a hack to allow CoNLLRDFUpdater to process output of CoNLLBrackets2RDF + # currently, CoNLLRDFUpdater supportes the historical Turtle 1.0 prefix only + # TODO: fix this in CoNLLRDFUpdater, this is a hack, only + sed -e s/'^[\t ]*PREFIX \(.*\)$'/'@prefix \1 .'/g # The following commands were pretty useful in debugging the code above # echo ${#mvn_exec_goal[@]} # echo the size of the array