Skip to content

Commit

Permalink
pre clavius@work added SentencesHandler
Browse files Browse the repository at this point in the history
  • Loading branch information
Angelo Mario Del Grosso committed Jun 10, 2014
1 parent d6fc721 commit 3b4ee2c
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 77 deletions.
144 changes: 68 additions & 76 deletions ClaviusLemmata/src/ilc/cnr/it/clavius/ClaviusMain.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,21 @@
import ilc.cnr.it.clavius.corpus.TreeBankHandler;

import ilc.cnr.it.clavius.lemmata.ParseToken;
import ilc.cnr.it.clavius.lemmata.ParseXMLAnalized;

import ilc.cnr.it.clavius.utils.ClaviusUtils;
import ilc.cnr.it.clavius.utils.SentencesHandler;
import ilc.cnr.it.clavius.utils.TextUtils;
import it.cnr.ilc.angelo.lemlat.LemLatQuery;
import it.cnr.ilc.angelo.lemlat.query.LemLatBaseSearch;
//import it.cnr.ilc.angelo.main.ParseToken;


import java.io.IOException;
import java.io.ObjectInputStream.GetField;
import java.util.List;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom2.Document;
import org.jdom2.JDOMException;
import org.jdom2.Parent;


/**
* @author angelodel80
Expand Down Expand Up @@ -190,84 +188,78 @@ public void setSentName(String sentName) {

public static void main(String[] args) {

//ClaviusMain main1 = new ClaviusMain();

ClaviusMain lemmatizationRun = new ClaviusMain();

//main2.manageCorpus("ldt-1.5.xml");

TextHandler th = new TextHandler();

/* extraction of the sentences from the TEI documents */

Map<String, String> sentences = th.getSentences(HandleConstants.getXmlTeiFile());

// Funzionalità per la riscrittura del testo in fullText
lemmatizationRun.writeFullText(sentences, HandleConstants.getFullTextFile());

// Funzionalità per il PoS Tagging
Object[] sents = sentences.values().toArray();
Object[] sKeys = sentences.keySet().toArray();


for(int i = 0; i< sents.length; i++){
lemmatizationRun.setMsg((String)sents[i]);
lemmatizationRun.setSentName(String.format("%s:: %s", (String)sKeys[i], lemmatizationRun.getMsg()));
System.out.println(lemmatizationRun.getSentName());
// //FIXME costruire i token prima di processare le sentences con HUNPOS!!!
lemmatizationRun.process(HandleConstants.getModelforHunPos(), HandleConstants.getPathToHunPos());
}
lemmatizationRun.writeOut(HandleConstants.getTaggedFile()); // Scrivo il contenuto del posTagging
ClaviusMain lemmatizationRun = new ClaviusMain();
TextHandler th = new TextHandler();

/* extraction of the sentences from the TEI documents */
Map<String, String> sentences = th.getSentences(HandleConstants.getXmlTeiFile());


/* processo per la ricerca del lemma nella banca dati formario-lemmario */
/* Funzionalità per la scrittura del testo in fullText */
lemmatizationRun.writeFullText(sentences, HandleConstants.getFullTextFile());


/* Funzionalità per il PoS Tagging */
Object[] sents = sentences.values().toArray();
Object[] sKeys = sentences.keySet().toArray();

for(int i = 0; i< sents.length; i++){
lemmatizationRun.setMsg((String)sents[i]);
lemmatizationRun.setSentName(String.format("%s:: %s", (String)sKeys[i], lemmatizationRun.getMsg()));
System.out.println(lemmatizationRun.getSentName());
lemmatizationRun.process(HandleConstants.getModelforHunPos(), HandleConstants.getPathToHunPos());
}

String[] ParseTokenArgs = {HandleConstants.getTaggedFile(), HandleConstants.getTabFileAnalyzed()};
ParseToken.main(ParseTokenArgs);

/*Processo per la costruzione del file XML*/
lemmatizationRun.writeOut(HandleConstants.getTaggedFile()); // Scrivo il contenuto del posTagging


// try {
// Document xmlSentences = TextUtils.TabToXml(HandleConstants.getTabFileAnalyzed(), true);
// ClaviusUtils.makeSentenceXML(xmlSentences);
// // TODO build XML for integration purposes (Tokens and Linguistical_Analysis)
// ClaviusUtils.makeIntegrationXMLforAnalysis(xmlSentences);
// } catch (JDOMException e) {
// // TODO Auto-generated catch block
// e.getMessage();
// e.printStackTrace();
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
/* processo per la ricerca del lemma nella banca dati formario-lemmario */
ParseToken.main(new String[] {HandleConstants.getTaggedFile(), HandleConstants.getTabFileAnalyzed()});

/*Processo per la costruzione del file XML*/
try {
Document xmlSentences = TextUtils.TabToXml(HandleConstants.getTabFileAnalyzed(), true);
ClaviusUtils.makeSentenceXML(xmlSentences);
// TODO build XML for integration purposes (Tokens and Linguistical_Analysis)
ClaviusUtils.makeIntegrationXMLforAnalysis(xmlSentences);
SentencesHandler.main(new String[]{HandleConstants.getWorkDir(), HandleConstants.getLetterRif()});
} catch (JDOMException e) {
e.getMessage();
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}



/* Processo per la query al LemLat */

// ParseXMLAnalized par = new ParseXMLAnalized("C:/tmp/MP/VERG/VERG/Test1_an.xml");
// StringBuilder lemLatBuider = new StringBuilder();
// try {
// List<String> tokens = par.extractTokens();
// for (String token : tokens) {
// //System.out.println(token);
// lemLatBuider.append("TOKEN URI: " + token);
// String[] argLemLat = new String[1];
// argLemLat[0] = token.replaceAll("(.+)@+?", "").replaceAll("\\[\\d\\]", "").toLowerCase();
// lemLatBuider.append("\n\tTOKEN STRING: " + argLemLat[0]+"\n");
// LemLatQuery.main(argLemLat);
// lemLatBuider.append(LemLatQuery.analysisStringBuider() + "\n");
// //lemLatBuider.append("\t\tANALYSIS" + "\n");
// lemLatBuider.append("\n");
// TextUtils.StringToFile(lemLatBuider, "C:/tmp/MP/VERG/VERG/LemLat_17032014_an.txt");
// }
// } catch (JDOMException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// ParseXMLAnalized par = new ParseXMLAnalized("C:/tmp/MP/VERG/VERG/Test1_an.xml");
// StringBuilder lemLatBuider = new StringBuilder();
// try {
// List<String> tokens = par.extractTokens();
// for (String token : tokens) {
// //System.out.println(token);
// lemLatBuider.append("TOKEN URI: " + token);
// String[] argLemLat = new String[1];
// argLemLat[0] = token.replaceAll("(.+)@+?", "").replaceAll("\\[\\d\\]", "").toLowerCase();
// lemLatBuider.append("\n\tTOKEN STRING: " + argLemLat[0]+"\n");
// LemLatQuery.main(argLemLat);
// lemLatBuider.append(LemLatQuery.analysisStringBuider() + "\n");
// //lemLatBuider.append("\t\tANALYSIS" + "\n");
// lemLatBuider.append("\n");
// TextUtils.StringToFile(lemLatBuider, "C:/tmp/MP/VERG/VERG/LemLat_17032014_an.txt");
// }
// } catch (JDOMException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }

//ClaviusMain main1 = new ClaviusMain();
//main1.manageCorpus("ldt-1.5.xml");


// main1.setMsg("fidelis dulcem amat virgo poetam");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
public class HandleConstants {

private final static String letterRif = "147";
private final static String TeiFile = "147_APUG_530_cc.129-130.xml";
private final static String TeiFile = "147-transcription.xml";
private final static String workDir = "C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/";
//"C:/tmp/Clavius/TEI-MarkUp/08042014/"+letterRif+"/";
//"C:/tmp/Clavius/TEI-MarkUp/08042014/136/";
Expand Down
135 changes: 135 additions & 0 deletions ClaviusLemmata/src/ilc/cnr/it/clavius/utils/SentencesHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/**
*
*/
package ilc.cnr.it.clavius.utils;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

/**
 * Post-processes a letter's sentence-analysis XML file: every sentence element
 * (child of the root) is annotated with {@code start}, {@code end} and
 * {@code span} attributes derived from its first and last token, and the
 * resulting document is pretty-printed to a second file.
 *
 * <p>The class works on a single static document, so it is not meant to be
 * used from several threads at once.
 *
 * @author Angelo Del Grosso
 *
 */
public class SentencesHandler {

    /** Document loaded by {@link #init(String)}; {@code null} until a load succeeds. */
    private static Document sentenceAnalysis;

    /**
     * Creates a handler. All the functionality is static; the constructor is
     * kept only for backward compatibility.
     */
    public SentencesHandler() {
    }

    /**
     * Parses the given XML file and stores it as the current document.
     *
     * @param file system id (path or URI) of the XML file to load
     * @return {@code true} if the document was parsed, {@code false} if
     *         parsing failed (the cause is printed to standard error)
     */
    private static boolean init(String file) {
        boolean ret = false;
        try {
            SAXBuilder builder = new SAXBuilder();
            sentenceAnalysis = builder.build(file);
            ret = true;
        } catch (Exception e) {
            // Parsing problems are reported but not propagated: the caller
            // only needs to know whether a document is available.
            e.printStackTrace();
        }
        return ret;
    }

    /**
     * Annotates the sentences of the current document and serializes it.
     *
     * @return the pretty-printed document, or {@code null} if no document
     *         has been loaded
     */
    private static StringBuffer run() {
        if (null == sentenceAnalysis) {
            return null;
        }
        System.err.println(sentenceAnalysis.toString());
        Element root = sentenceAnalysis.getRootElement();
        List<Element> sents = root.getChildren();
        handleSentences(sents);
        return outputSentences(sentenceAnalysis);
    }

    /**
     * Serializes a document using JDOM's pretty format.
     *
     * @param document the document to serialize
     * @return the serialized XML
     */
    private static StringBuffer outputSentences(Document document) {
        XMLOutputter out = new XMLOutputter(Format.getPrettyFormat());
        return new StringBuffer(out.outputString(document));
    }

    /**
     * Sets the {@code start}, {@code end} and {@code span} attributes on each
     * sentence element.
     *
     * <p>The {@code start} of the first token and the {@code end} of the last
     * token are shifted by a running offset, which after each sentence is set
     * to that sentence's {@code end + 1}. Sentences without child elements
     * are left untouched and do not advance the offset.
     *
     * @param sents the sentence elements, in document order
     * @throws NumberFormatException if a token's {@code start} or {@code end}
     *         attribute is missing or is not an integer
     */
    private static void handleSentences(List<Element> sents) {
        int offset = 0;
        for (Element sent : sents) {
            List<Element> toks = sent.getChildren();
            if (toks.isEmpty()) {
                // No tokens means there is nothing to derive the offsets
                // from; previously this raised IndexOutOfBoundsException.
                continue;
            }
            Element firstTok = toks.get(0);
            Element lastTok = toks.get(toks.size() - 1);
            int start = offset + Integer.parseInt(firstTok.getAttributeValue("start"));
            int end = offset + Integer.parseInt(lastTok.getAttributeValue("end"));

            sent.setAttribute("start", String.valueOf(start));
            sent.setAttribute("end", String.valueOf(end));
            sent.setAttribute("span",
                    firstTok.getAttributeValue("uri") + "-" + lastTok.getAttributeValue("uri"));
            offset = end + 1;
        }
    }

    /**
     * Writes the buffer to the given file, encoded as UTF-8. I/O failures are
     * printed to standard error and not propagated.
     *
     * @param sentences the content to write
     * @param file path of the output file; it is overwritten if it exists
     */
    private static void bufferToFile(StringBuffer sentences, String file) {
        BufferedWriter writer = null;
        try {
            File outFile = new File(file);
            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8"));
            writer.write(sentences.toString());
        } catch (FileNotFoundException fe) {
            fe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            // The writer is still null when the file could not be opened;
            // closing it unconditionally would raise a NullPointerException
            // that masks the original failure.
            if (writer != null) {
                try {
                    writer.close(); // close() flushes the buffered content
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads {@code <dir>Letter<id>_an.xml}, annotates its sentences and
     * writes the result to {@code <dir>Letter<id>_anOUT.xml}; the serialized
     * document is also printed to standard output.
     *
     * @param args {@code args[0]} is the working directory (used as a plain
     *        prefix, so it must end with a path separator) and
     *        {@code args[1]} is the letter identifier
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            System.err.println("usage: SentencesHandler <workDir> <letterId>");
            return;
        }
        String file = String.format("%sLetter%s_an.xml", args[0], args[1]);
        String fileOut = String.format("%sLetter%s_anOUT.xml", args[0], args[1]);
        if (!SentencesHandler.init(file)) {
            System.out.println("nada!!");
            return;
        }
        StringBuffer sentences = SentencesHandler.run();
        System.out.println(sentences.toString());
        SentencesHandler.bufferToFile(sentences, fileOut);
    }

}

0 comments on commit 3b4ee2c

Please sign in to comment.