-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from miaaaooow/master
Sloth.Works Hackathon - GATE Experiment
- Loading branch information
Showing
56 changed files
with
4,772 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
**.idea/ | ||
*.iml | ||
**/target/** | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
**.idea/ | ||
*.iml | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Задачи: | ||
- В закон(plain text) разпознаване на закони, алинеи и членове | ||
- LegalXML версия на закона | ||
- В поправки от ДВ - разпознаване на поправка, шаблон, за кой закон е | ||
- Генериране на diff | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Структура на закон | ||
================== | ||
|
||
закон law | ||
глава chapter | ||
раздел section | ||
член article | ||
алинея subparagraph | ||
точка point | ||
буква letter | ||
изречение sentence | ||
|
||
изменения | ||
допълнителни разпоредби ~ раздел | ||
преходни разпоредби | ||
заключителни разпоредби | ||
параграф | ||
|
||
amendments | ||
additional provisions ~ section | ||
transitional provisions | ||
final provisions | ||
paragraph | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
Типове промени | ||
|
||
§ - членове в Допълнителни разпоредби | ||
§ - членове в Заключителни разпоредби | ||
|
||
§ или само число - дефинира предстоящи промени | ||
|
||
|
||
|
||
Създава се ... - add | ||
....се създава - add | ||
В - in | ||
след думата - after word | ||
|
||
|
||
думите ... се заменят ... - substitute words ... with .... | ||
|
||
в основния текст думите ... се заменят ... - substitute words ... with ... | ||
|
||
|
||
... се изменя така - is changed as follows | ||
....се изменя със - change with | ||
се добавя.... - add | ||
накрая се добавя - add at the end | ||
|
||
се добавя запетая - , | ||
се създава изречение второ - | ||
.... се отменя - ... delete | ||
... се заличават - ... delete | ||
Заглавието се изменя така... | ||
Досегашният текст става ал.1. Създава се ал. 2 | ||
|
||
|
||
Алинея 1 | ||
ал. 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.openlex.gate</groupId>
    <artifactId>gate-experiment</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The sources use the diamond operator, so pin the compiler level
             explicitly instead of relying on the compiler plugin's default. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Sources and resources contain Cyrillic text; do not depend on the
             platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- GATE Embedded: datastores, documents and annotations. -->
        <dependency>
            <groupId>uk.ac.gate</groupId>
            <artifactId>gate-core</artifactId>
            <version>8.4</version>
        </dependency>
    </dependencies>

</project>
167 changes: 167 additions & 0 deletions
167
...athon/gate-experiment/src/main/java/org/openlex/experiments/io/AnnotatedCorpusReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
package org.openlex.experiments.io; | ||
|
||
import gate.*; | ||
import gate.util.GateException; | ||
import gate.util.InvalidOffsetException; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.File; | ||
import java.io.FileWriter; | ||
import java.io.IOException; | ||
import java.util.*; | ||
|
||
/** | ||
* Created by mateva on 21.01.18. | ||
*/ | ||
public class AnnotatedCorpusReader { | ||
private static String PATH_TO_GATE = "/home/mateva/Installs/GATE"; | ||
private static String PATH_TO_GATE_PLUGINS = "/home/mateva/Installs/GATE/plugins"; | ||
|
||
private static String PATH_TO_RESOURCES = "/home/mateva/OpenLex/M/OpenLex/sloth.works.hakathon/gate-experiment/src/main/resources/"; | ||
private static String PATH_TO_FILE_RESOURCES = "file://" + PATH_TO_RESOURCES; | ||
private static String PATH_TO_RESULT_OUTPUT = PATH_TO_RESOURCES + "results/"; | ||
private static String PATH_TO_ORIGINAL_OUTPUT = PATH_TO_RESOURCES + "original/"; | ||
|
||
private static String DATA_STORE_CLASS = "gate.persist.SerialDataStore"; | ||
private static String DOC_IMPL_CLASS = "gate.corpora.DocumentImpl"; | ||
|
||
|
||
|
||
private void read() { | ||
setupAndStartGate(); | ||
|
||
DataStore annotatedLaws = null; | ||
DataStore annotatedAmendments = null; | ||
try { | ||
annotatedLaws = Factory.openDataStore(DATA_STORE_CLASS, PATH_TO_FILE_RESOURCES + "laws"); | ||
annotatedAmendments = Factory.openDataStore(DATA_STORE_CLASS, PATH_TO_FILE_RESOURCES + "amends"); | ||
|
||
List annotatedAmendmentsLrIds = annotatedAmendments.getLrIds(DOC_IMPL_CLASS); | ||
|
||
Set<Diff> diffs = new HashSet<>(); | ||
|
||
for (Object id : annotatedAmendmentsLrIds) { | ||
Document d = readDocumentFrom(annotatedAmendments, id); | ||
|
||
for (Annotation a : d.getAnnotations().get("RuleSubstitute")) { | ||
FeatureMap map = a.getFeatures(); | ||
String alNum = (String) map.get("alinea_number"); | ||
String articleNum = (String) map.get("article_number"); | ||
String what = (String) map.get("what"); | ||
String withWhat = (String) map.get("withWhat"); | ||
|
||
Diff diff = new Diff(alNum, articleNum, what, withWhat, d); | ||
diffs.add(diff); | ||
|
||
System.out.println(alNum); | ||
System.out.println(articleNum); | ||
System.out.println(what); | ||
System.out.println(withWhat); | ||
} | ||
|
||
Factory.deleteResource(d); | ||
// } | ||
} | ||
|
||
List lawsDocIds = annotatedLaws.getLrIds(DOC_IMPL_CLASS); | ||
|
||
for (Object id : lawsDocIds) { | ||
Document d = readDocumentFrom(annotatedLaws, id); | ||
|
||
String originalContent = d.getContent().toString(); | ||
String name = d.getName(); | ||
writeContentTOFileAtPath(originalContent, PATH_TO_ORIGINAL_OUTPUT + name); | ||
|
||
AnnotationSet alineaContents = d.getAnnotations().get("AlineaContent"); | ||
|
||
Map<String, String> changed = new HashMap<>(); | ||
for (Diff diff : diffs) { | ||
|
||
for (Annotation alineaContent : alineaContents) { | ||
FeatureMap features = alineaContent.getFeatures(); | ||
if (diff.getAlNum().equals(features.get("number")) | ||
&& diff.getArticleNum().equals(features.get("article_number"))) { | ||
System.out.println("Match!"); | ||
} | ||
String tosub = getPartOfDocument(d, alineaContent.getStartNode().getOffset(), | ||
alineaContent.getEndNode().getOffset()); | ||
String newVer = tosub.replaceAll(diff.getWhat(), diff.getWithWhat()); | ||
changed.put(tosub, newVer); | ||
} | ||
|
||
} | ||
for (Map.Entry<String, String> entry : changed.entrySet()) { | ||
originalContent = originalContent.replace(entry.getKey(), entry.getValue()); | ||
} | ||
|
||
writeContentTOFileAtPath(originalContent, PATH_TO_RESULT_OUTPUT + name); | ||
Factory.deleteResource(d); | ||
|
||
} | ||
|
||
} catch (GateException e) { | ||
System.out.println(e); | ||
} | ||
} | ||
|
||
private String getPartOfDocument(Document docs, Long startOffSet, Long endOffset) { | ||
try { | ||
return docs.getContent().getContent(startOffSet, endOffset).toString(); | ||
} catch (InvalidOffsetException e) { | ||
handleFuckingException(e); | ||
} | ||
return null; | ||
} | ||
|
||
private void writeOriginalFile(Document document) { | ||
String originalContent = document.getContent().toString(); | ||
String name = document.getName(); | ||
writeContentTOFileAtPath(originalContent, PATH_TO_ORIGINAL_OUTPUT + name); | ||
} | ||
|
||
private void writeContentTOFileAtPath(String content, String path) { | ||
try { | ||
BufferedWriter writer = new BufferedWriter(new FileWriter(path)); | ||
writer.write(content); | ||
writer.close(); | ||
} catch (IOException e) { | ||
handleFuckingException(e); | ||
} | ||
} | ||
|
||
private void setupAndStartGate() { | ||
if (Gate.getGateHome() == null) { | ||
Gate.setGateHome(new File(PATH_TO_GATE)); | ||
} | ||
if (Gate.getPluginsHome() == null) { | ||
Gate.setPluginsHome(new File(PATH_TO_GATE_PLUGINS)); | ||
} | ||
|
||
try { | ||
Gate.init(); | ||
} catch (GateException ge) { | ||
handleFuckingException(ge); | ||
} | ||
} | ||
|
||
private Document readDocumentFrom(DataStore ds, Object id) { | ||
try { | ||
return (Document) Factory.createResource(DOC_IMPL_CLASS, | ||
gate.Utils.featureMap(DataStore.DATASTORE_FEATURE_NAME, ds, | ||
DataStore.LR_ID_FEATURE_NAME, id)); | ||
} catch (Exception e) { | ||
handleFuckingException(e); | ||
} | ||
return null; | ||
} | ||
|
||
private void handleFuckingException(Exception e) { | ||
System.out.println(e); | ||
} | ||
|
||
public static void main(String[] args) { | ||
AnnotatedCorpusReader reader = new AnnotatedCorpusReader(); | ||
reader.read(); | ||
} | ||
|
||
} |
59 changes: 59 additions & 0 deletions
59
sloth.works.hakathon/gate-experiment/src/main/java/org/openlex/experiments/io/Diff.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
package org.openlex.experiments.io; | ||
|
||
import gate.Document; | ||
|
||
/** | ||
* Created by mateva on 21.01.18. | ||
*/ | ||
public class Diff { | ||
String alNum ; | ||
String articleNum ; | ||
String what; | ||
String withWhat ; | ||
Document doc; | ||
|
||
public Diff(String alNum, String articleNum, String what, String withWhat, Document doc) { | ||
this.alNum = alNum; | ||
this.articleNum = articleNum; | ||
this.what = what; | ||
this.withWhat = withWhat; | ||
this.doc = doc; | ||
} | ||
|
||
public Document getDoc() { | ||
return doc; | ||
} | ||
|
||
public String getAlNum() { | ||
return alNum; | ||
} | ||
|
||
public void setAlNum(String alNum) { | ||
this.alNum = alNum; | ||
} | ||
|
||
public String getArticleNum() { | ||
return articleNum; | ||
} | ||
|
||
public void setArticleNum(String articleNum) { | ||
this.articleNum = articleNum; | ||
} | ||
|
||
public String getWhat() { | ||
return what; | ||
} | ||
|
||
public void setWhat(String what) { | ||
this.what = what; | ||
} | ||
|
||
public String getWithWhat() { | ||
return withWhat; | ||
} | ||
|
||
public void setWithWhat(String withWhat) { | ||
this.withWhat = withWhat; | ||
} | ||
|
||
} |
Oops, something went wrong.