From 3b4ee2c5f3cd03e36995d33e00b3130e0ef4963b Mon Sep 17 00:00:00 2001 From: Angelo Mario Del Grosso Date: Tue, 10 Jun 2014 21:06:03 +0200 Subject: [PATCH] pre clavius@work added SentencesHandler --- .../src/ilc/cnr/it/clavius/ClaviusMain.java | 144 +++++++++--------- .../it/clavius/constants/HandleConstants.java | 2 +- .../it/clavius/utils/SentencesHandler.java | 135 ++++++++++++++++ 3 files changed, 204 insertions(+), 77 deletions(-) create mode 100644 ClaviusLemmata/src/ilc/cnr/it/clavius/utils/SentencesHandler.java diff --git a/ClaviusLemmata/src/ilc/cnr/it/clavius/ClaviusMain.java b/ClaviusLemmata/src/ilc/cnr/it/clavius/ClaviusMain.java index 788706f..e99effb 100644 --- a/ClaviusLemmata/src/ilc/cnr/it/clavius/ClaviusMain.java +++ b/ClaviusLemmata/src/ilc/cnr/it/clavius/ClaviusMain.java @@ -8,23 +8,21 @@ import ilc.cnr.it.clavius.corpus.TreeBankHandler; import ilc.cnr.it.clavius.lemmata.ParseToken; -import ilc.cnr.it.clavius.lemmata.ParseXMLAnalized; + import ilc.cnr.it.clavius.utils.ClaviusUtils; +import ilc.cnr.it.clavius.utils.SentencesHandler; import ilc.cnr.it.clavius.utils.TextUtils; -import it.cnr.ilc.angelo.lemlat.LemLatQuery; -import it.cnr.ilc.angelo.lemlat.query.LemLatBaseSearch; -//import it.cnr.ilc.angelo.main.ParseToken; + import java.io.IOException; -import java.io.ObjectInputStream.GetField; -import java.util.List; + import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jdom2.Document; import org.jdom2.JDOMException; -import org.jdom2.Parent; + /** * @author angelodel80 @@ -190,84 +188,78 @@ public void setSentName(String sentName) { public static void main(String[] args) { - //ClaviusMain main1 = new ClaviusMain(); - - ClaviusMain lemmatizationRun = new ClaviusMain(); - - //main2.manageCorpus("ldt-1.5.xml"); - - TextHandler th = new TextHandler(); - - /* estrazione delle sentece da documenti TEI */ - - Map sentences = th.getSentences(HandleConstants.getXmlTeiFile()); - - // Funzionalità per la riscrittura del testo in fullText - lemmatizationRun.writeFullText(sentences, HandleConstants.getFullTextFile()); - - // Funzionalità per il PoS Tagging - Object[] sents = sentences.values().toArray(); - Object[] sKeys = sentences.keySet().toArray(); - - - for(int i = 0; i< sents.length; i++){ - lemmatizationRun.setMsg((String)sents[i]); - lemmatizationRun.setSentName(String.format("%s:: %s", (String)sKeys[i], lemmatizationRun.getMsg())); - System.out.println(lemmatizationRun.getSentName()); - // //FIXME costruire i token prima di processare le sentences con HUNPOS!!! - lemmatizationRun.process(HandleConstants.getModelforHunPos(), HandleConstants.getPathToHunPos()); - } - lemmatizationRun.writeOut(HandleConstants.getTaggedFile()); // Scrivo il contenuto del posTagging + ClaviusMain lemmatizationRun = new ClaviusMain(); + TextHandler th = new TextHandler(); + + /* estrazione delle sentece da documenti TEI */ + Map sentences = th.getSentences(HandleConstants.getXmlTeiFile()); + - /* processo per la ricerca del lemma nella banca dati formario-lemmario */ + /* Funzionalità per la scrittura del testo in fullText */ + lemmatizationRun.writeFullText(sentences, HandleConstants.getFullTextFile()); + + + /* Funzionalità per il PoS Tagging */ + Object[] sents = sentences.values().toArray(); + Object[] sKeys = sentences.keySet().toArray(); + + for(int i = 0; i< sents.length; i++){ + lemmatizationRun.setMsg((String)sents[i]); + lemmatizationRun.setSentName(String.format("%s:: %s", (String)sKeys[i], lemmatizationRun.getMsg())); + System.out.println(lemmatizationRun.getSentName()); + lemmatizationRun.process(HandleConstants.getModelforHunPos(), HandleConstants.getPathToHunPos()); + } - String[] ParseTokenArgs = {HandleConstants.getTaggedFile(), HandleConstants.getTabFileAnalyzed()}; - ParseToken.main(ParseTokenArgs); - - /*Processo per la costruzione del file XML*/ + lemmatizationRun.writeOut(HandleConstants.getTaggedFile()); // Scrivo il contenuto del posTagging + -// try { -// Document xmlSentences = TextUtils.TabToXml(HandleConstants.getTabFileAnalyzed(), true); -// ClaviusUtils.makeSentenceXML(xmlSentences); -// // TODO build XML for integration purposes (Tokens and Linguistical_Analysis) -// ClaviusUtils.makeIntegrationXMLforAnalysis(xmlSentences); -// } catch (JDOMException e) { -// // TODO Auto-generated catch block -// e.getMessage(); -// e.printStackTrace(); -// } catch (IOException e) { -// // TODO Auto-generated catch block -// e.printStackTrace(); -// } + /* processo per la ricerca del lemma nella banca dati formario-lemmario */ + ParseToken.main(new String[] {HandleConstants.getTaggedFile(), HandleConstants.getTabFileAnalyzed()}); + /*Processo per la costruzione del file XML*/ + try { + Document xmlSentences = TextUtils.TabToXml(HandleConstants.getTabFileAnalyzed(), true); + ClaviusUtils.makeSentenceXML(xmlSentences); + // TODO build XML for integration purposes (Tokens and Linguistical_Analysis) + ClaviusUtils.makeIntegrationXMLforAnalysis(xmlSentences); + SentencesHandler.main(new String[]{HandleConstants.getWorkDir(), HandleConstants.getLetterRif()}); + } catch (JDOMException e) { + e.getMessage(); + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + /* Processo per la query al LemLat */ -// ParseXMLAnalized par = new ParseXMLAnalized("C:/tmp/MP/VERG/VERG/Test1_an.xml"); -// StringBuilder lemLatBuider = new StringBuilder(); -// try { -// List tokens = par.extractTokens(); -// for (String token : tokens) { -// //System.out.println(token); -// lemLatBuider.append("TOKEN URI: " + token); -// String[] argLemLat = new String[1]; -// argLemLat[0] = token.replaceAll("(.+)@+?", "").replaceAll("\\[\\d\\]", "").toLowerCase(); -// lemLatBuider.append("\n\tTOKEN STRING: " + argLemLat[0]+"\n"); -// LemLatQuery.main(argLemLat); -// lemLatBuider.append(LemLatQuery.analysisStringBuider() + "\n"); -// //lemLatBuider.append("\t\tANALYSIS" + "\n"); -// lemLatBuider.append("\n"); -// TextUtils.StringToFile(lemLatBuider, "C:/tmp/MP/VERG/VERG/LemLat_17032014_an.txt"); -// } -// } catch (JDOMException e) { -// // TODO Auto-generated catch block -// e.printStackTrace(); -// } catch (IOException e) { -// // TODO Auto-generated catch block -// e.printStackTrace(); -// } + // ParseXMLAnalized par = new ParseXMLAnalized("C:/tmp/MP/VERG/VERG/Test1_an.xml"); + // StringBuilder lemLatBuider = new StringBuilder(); + // try { + // List tokens = par.extractTokens(); + // for (String token : tokens) { + // //System.out.println(token); + // lemLatBuider.append("TOKEN URI: " + token); + // String[] argLemLat = new String[1]; + // argLemLat[0] = token.replaceAll("(.+)@+?", "").replaceAll("\\[\\d\\]", "").toLowerCase(); + // lemLatBuider.append("\n\tTOKEN STRING: " + argLemLat[0]+"\n"); + // LemLatQuery.main(argLemLat); + // lemLatBuider.append(LemLatQuery.analysisStringBuider() + "\n"); + // //lemLatBuider.append("\t\tANALYSIS" + "\n"); + // lemLatBuider.append("\n"); + // TextUtils.StringToFile(lemLatBuider, "C:/tmp/MP/VERG/VERG/LemLat_17032014_an.txt"); + // } + // } catch (JDOMException e) { + // // TODO Auto-generated catch block + // e.printStackTrace(); + // } catch (IOException e) { + // // TODO Auto-generated catch block + // e.printStackTrace(); + // } + //ClaviusMain main1 = new ClaviusMain(); + //main1.manageCorpus("ldt-1.5.xml"); // main1.setMsg("fidelis dulcem amat virgo poetam"); diff --git a/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java b/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java index 43adb04..7839901 100644 --- a/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java +++ b/ClaviusLemmata/src/ilc/cnr/it/clavius/constants/HandleConstants.java @@ -10,7 +10,7 @@ public class HandleConstants { private final static String letterRif = "147"; - private final static String TeiFile = "147_APUG_530_cc.129-130.xml"; + private final static String TeiFile = "147-transcription.xml"; private final static String workDir = "C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/"; //"C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/"; //"C:/tmp/Clavius/TEI-MarkUp/08042014/136/"; diff --git a/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/SentencesHandler.java b/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/SentencesHandler.java new file mode 100644 index 0000000..38b7ddf --- /dev/null +++ b/ClaviusLemmata/src/ilc/cnr/it/clavius/utils/SentencesHandler.java @@ -0,0 +1,135 @@ +/** + * + */ +package ilc.cnr.it.clavius.utils; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.util.List; + +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.input.SAXBuilder; +import org.jdom2.output.Format; +import org.jdom2.output.XMLOutputter; + +/** + * @author Angelo Del Grosso + * + */ +public class SentencesHandler { + + private static Document sentenceAnalysis; + /** + * + */ + public SentencesHandler() { + // TODO Auto-generated constructor stub + } + + + private static final boolean init(String file){ + boolean ret = false; + try { + SAXBuilder builder = new SAXBuilder(); + sentenceAnalysis = builder.build(file); + ret = true; + } catch (Exception e) { + // TODO: handle exception + e.printStackTrace(); + } + + return ret; + } + + private static final StringBuffer run(){ + StringBuffer ret = null; + if(null!=sentenceAnalysis){ + System.err.println(sentenceAnalysis.toString()); + Element root = sentenceAnalysis.getRootElement(); + List sents = root.getChildren(); + handleSentences(sents); + ret = outputSentences(sentenceAnalysis); + } + return ret; + } + + + private static StringBuffer outputSentences(Document sentenceAnalysis2) { + // TODO Auto-generated method stub + StringBuffer ret = null; + XMLOutputter out = new XMLOutputter(Format.getPrettyFormat()); + ret = new StringBuffer(out.outputString(sentenceAnalysis2)); + return ret; + } + + + private static void handleSentences(List sents) { + // TODO Auto-generated method stub + int count = 0; + for (Element sent : sents) { + List toks = sent.getChildren(); + Element firstTok = toks.get(0); + Element lastTok = toks.get(toks.size()-1); + int start = count+Integer.parseInt(firstTok.getAttributeValue("start")); + int end = count+Integer.parseInt(lastTok.getAttributeValue("end")); + + sent.setAttribute("start", String.valueOf(start)); + sent.setAttribute("end", String.valueOf(end)); + sent.setAttribute("span", firstTok.getAttributeValue("uri")+"-"+lastTok.getAttributeValue("uri")); + count = end+1; + //System.err.println(sent.toString()); + } + } + + + private static void BufferToFile(StringBuffer sentences, String file) { + // TODO Auto-generated method stub + BufferedWriter writer = null; + try{ + File outFile = new File(file); + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")); + writer.write(sentences.toString()); + + }catch(FileNotFoundException fe){ + fe.printStackTrace(); + }catch(IOException ioe){ + ioe.printStackTrace(); + }finally{ + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + } + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + StringBuffer sentences = null; + String file = String.format("%sLetter%s_an.xml", args[0],args[1]); + String fileOut = String.format("%sLetter%s_anOUT.xml", args[0],args[1]); + if(SentencesHandler.init(file)){ + sentences = SentencesHandler.run(); + System.out.println(sentences.toString()); + SentencesHandler.BufferToFile(sentences,fileOut); + } + else{ + System.out.println("nada!!"); + } + + + + } + +}