From 1c7b5ca93ff96291c0700099bc1d85d4a1b1ca1e Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 24 Oct 2020 17:28:41 -0400 Subject: [PATCH 01/25] Fix block writer bug --- .../org/opendata/curation/d4/Constants.java | 2 +- .../SignatureBlocksConsumerFactory.java | 1 + .../signature/SignatureBlocksGenerator.java | 6 ++-- .../SignatureBlocksIndexFactory.java | 5 +++ .../SignatureBlocksWriterFactory.java | 33 ++++++++++++++++--- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 94d1601..fe8eaea 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.28.0"; + public static final String VERSION = "0.29.0.dev1"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java index 1b48691..bfc38d6 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java @@ -24,6 +24,7 @@ */ public interface SignatureBlocksConsumerFactory { + public void close(); public SignatureBlocksConsumer getConsumer(int[] nodeSizes); public SignatureBlocksIndex signatures() throws java.io.IOException; diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java index ad8e600..eb51f79 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java +++ 
b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java @@ -80,8 +80,6 @@ public BlockGeneratorTask( @Override public void run() { - _consumer.open(); - Integer nodeId; while ((nodeId = _queue.poll()) != null) { List sig; @@ -114,8 +112,6 @@ public void run() { ) ); } - - _consumer.close(); } } @@ -175,6 +171,8 @@ private void compute( throw new RuntimeException(ex); } + consumerFactory.close(); + Date end = new Date(); if (verbose) { System.out.println("END @ " + end); diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java index a93fc91..2789184 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java @@ -28,6 +28,11 @@ public class SignatureBlocksIndexFactory implements SignatureBlocksConsumerFacto private SignatureBlocksConsumer _consumer = null; private SignatureBlocksIndex _signatures = null; + + @Override + public void close() { + + } @Override public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java index 10f98e3..0c3bf31 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java @@ -18,11 +18,16 @@ package org.opendata.curation.d4.signature; import java.io.File; +import java.util.ArrayList; +import java.util.List; import org.opendata.curation.d4.signature.trim.LiberalTrimmer; import org.opendata.core.io.FileSystem; /** - * Signature blocks writer factory. + * Signature blocks writer factory. Returns consumer that are open. 
Since this + * factory returns the same consumer multiple times if writing to a single file + * the calling method should not close the consumer but call the close method + * of this class instead at the end of processing. * * @author Heiko Mueller */ @@ -32,18 +37,31 @@ public class SignatureBlocksWriterFactory implements SignatureBlocksConsumerFact private final File _file; private SignatureBlocksConsumer _globalConsumer = null; private final boolean _outputToDir; + private List _openConsumer = null; public SignatureBlocksWriterFactory(File file, boolean outputToDir) { _file = file; _outputToDir = outputToDir; + _openConsumer = new ArrayList<>(); + if (outputToDir) { FileSystem.createFolder(file); } else { FileSystem.createParentFolder(file); } } + + @Override + public void close() { + + for (SignatureBlocksConsumer consumer : _openConsumer) { + consumer.close(); + } + _globalConsumer = null; + _openConsumer = null; + } @Override public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { @@ -51,16 +69,21 @@ public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { if (_outputToDir) { String filename = "signature-blocks." 
+ (_count++) + ".txt.gz"; File outputFile = FileSystem.joinPath(_file, filename); - return new LiberalTrimmer( - nodeSizes, - new SignatureBlocksWriter(outputFile) - ); + SignatureBlocksConsumer trimmer = new LiberalTrimmer( + nodeSizes, + new SignatureBlocksWriter(outputFile) + ); + trimmer.open(); + _openConsumer.add(trimmer); + return trimmer; } else { if (_globalConsumer == null) { _globalConsumer = new LiberalTrimmer( nodeSizes, new SignatureBlocksWriter(_file) ); + _globalConsumer.open(); + _openConsumer.add(_globalConsumer); } return _globalConsumer; } From 86e995b644faa8526a7797915d5138d23d3c137a Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 24 Oct 2020 22:17:20 -0400 Subject: [PATCH 02/25] Add CENTRIST trimmer for signature generation --- pom.xml | 4 +- .../java/org/opendata/curation/d4/D4.java | 11 +- .../curation/d4/column/ColumnExpander.java | 17 ++- .../d4/column/ExpandedColumnIndex.java | 19 ++- .../d4/column/ParallelColumnExpander.java | 11 +- .../d4/column/SingleColumnExpander.java | 5 - .../domain/ParallelLocalDomainGenerator.java | 6 +- .../d4/signature/ContextSignaturePrinter.java | 55 +------- .../SignatureBlocksConsumerFactory.java | 2 +- .../signature/SignatureBlocksGenerator.java | 5 +- .../SignatureBlocksIndexFactory.java | 10 +- .../SignatureBlocksWriterFactory.java | 18 ++- .../d4/signature/trim/BlockScoreFunction.java | 97 ++++++++++++- .../trim/CentristBlockRelevanceFilter.java | 128 ++++++++++++++++++ .../d4/signature/trim/CentristTrimmer.java | 75 +++------- .../d4/signature/trim/MinJIScore.java | 10 +- .../d4/signature/trim/PrecisionScore.java | 15 +- .../trim/SignatureTrimmerFactory.java | 36 ++++- 18 files changed, 378 insertions(+), 146 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java diff --git a/pom.xml b/pom.xml index a4aab1a..eceba4d 100644 --- a/pom.xml +++ b/pom.xml @@ -70,11 +70,11 @@ true - org.opendata.curation.d4.D4 + - + 
org.opendata.curation.d4.explore.BlockColSim diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 489d62f..0dc06e1 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -52,6 +52,7 @@ import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.SignatureBlocksWriterFactory; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; +import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.db.column.Column; import org.opendata.db.eq.CompressedTermIndexGenerator; import org.opendata.db.eq.EQIndex; @@ -152,6 +153,7 @@ public void localDomains( public void signatures( EQIndex nodeIndex, + String trimmerSpec, boolean fullSignatureConstraint, boolean ignoreLastDrop, int threads, @@ -161,7 +163,11 @@ public void signatures( ) throws java.lang.InterruptedException, java.io.IOException { SignatureBlocksWriterFactory sigWriter; - sigWriter = new SignatureBlocksWriterFactory(outputFile, false); + sigWriter = new SignatureBlocksWriterFactory( + outputFile, + new SignatureTrimmerFactory(nodeIndex, nodeIndex.columns(), trimmerSpec), + false + ); new SignatureBlocksGenerator(telemetry).runWithMaxDrop( nodeIndex, new ConcurrentLinkedQueue<>(nodeIndex.keys().toList()), @@ -367,6 +373,7 @@ public static void main(String[] args) { "eqs", " [default: 'compressed-term-index.txt.gz']" ), + new Parameter("trimmer", " [default: LIBERAL]"), new Parameter("threads", " [default: 6]"), new Parameter("verbose", " [default: true]"), new Parameter("signatures", " [default: 'signatures.txt.gz']") @@ -374,6 +381,7 @@ public static void main(String[] args) { args ); File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz"); + String trimmerSpec = params.getAsString("trimmer", SignatureTrimmer.LIBERAL); int threads = params.getAsInt("threads", 6); boolean verbose = 
params.getAsBool("verbose", true); File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); @@ -382,6 +390,7 @@ public static void main(String[] args) { try { new D4().signatures( new EQIndex(eqFile), + trimmerSpec, fullSignatureConstraint, ignoreLastDrop, threads, diff --git a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java index 4a19eb1..ae9e0bb 100644 --- a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java @@ -26,6 +26,7 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; +import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.db.eq.EQIndex; /** @@ -87,7 +88,13 @@ public void run() { if (!columnExpander.isDone()) { SignatureTrimmer trimmer; trimmer = _trimmerFactory - .getTrimmer(column.originalNodes(), columnExpander); + .getTrimmer( + new MutableIdentifiableIDSet( + column.id(), + column.originalNodes() + ), + columnExpander + ); dispatcher.add(trimmer); expanders.add(columnExpander); } else { @@ -124,7 +131,13 @@ public void run() { active.add(expander); SignatureTrimmer trimmer; trimmer = _trimmerFactory - .getTrimmer(expander.column().nodes(), expander); + .getTrimmer( + new MutableIdentifiableIDSet( + expander.column().id(), + expander.column().nodes() + ), + expander + ); dispatcher.add(trimmer); } } diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java index 88e566d..232952c 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java @@ -21,7 +21,10 @@ import java.util.HashMap; import java.util.List; import 
org.opendata.core.set.HashIDSet; +import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.db.column.Column; /** * Create an unique index of expanded columns. Two expanded columns are @@ -81,5 +84,19 @@ public void open() { _columnIndex = new HashMap<>(); _columnList = new ArrayList<>(); _columnMapping = new HashMap<>(); - } + } + + public IdentifiableObjectSet toColumns() { + + HashObjectSet result = new HashObjectSet<>(); + + for (ExpandedColumn column : _columnList) { + IDSet nodes = column.nodes(); + for (int columnId : this.columns(column.id())) { + result.add(new Column(columnId, nodes)); + } + } + + return result; + } } diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index 937005e..9ff9f81 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -37,6 +37,7 @@ import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.core.util.MemUsagePrinter; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; import org.opendata.db.column.Column; @@ -120,7 +121,13 @@ public void run() { for (SingleColumnExpander expander : columns) { SignatureTrimmer trimmer; trimmer = _trimmerFactory - .getTrimmer(expander.column().nodes(), expander); + .getTrimmer( + new MutableIdentifiableIDSet( + expander.column().id(), + expander.column().nodes() + ), + expander + ); dispatcher.add(trimmer); } round++; @@ -259,7 +266,7 @@ public void run( nodes, columns, signatures, - new SignatureTrimmerFactory(nodes, trimmer), + new SignatureTrimmerFactory(nodes, nodes.columns(), trimmer), threshold, decreaseFactor, 
numberOfIterations, diff --git a/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java index 332d6b1..143d3cc 100644 --- a/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java @@ -97,7 +97,6 @@ public int overallSupportCount() { private boolean _done = false; private int _expansionSize; private int _iteration; - private final int _maxNodeId; private final int[] _nodeSizes; private final int _numberOfIterations; private HashMap _support; @@ -117,8 +116,6 @@ public SingleColumnExpander( _decreaseFactor = decreaseFactor; _threshold = threshold; - _maxNodeId = nodes.getMaxId(); - _done = (_numberOfIterations <= 0); _iteration = 0; @@ -174,7 +171,6 @@ public void close() { for (int nodeId : _support.keySet()) { SupportCounter sup = _support.get(nodeId); - boolean added = false; BigDecimal orgSup = null; try { orgSup = sup.originalSupport(_columnSize); @@ -201,7 +197,6 @@ public void close() { if (_threshold.isSatisfied(overallSup)) { expansionNodes.add(nodeId); expansionSize += _nodeSizes[nodeId]; - added = true; } } } diff --git a/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java index a606c50..5e6d096 100644 --- a/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java @@ -33,6 +33,7 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.set.IDSet; +import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; import org.opendata.db.eq.EQIndex; @@ -84,7 +85,8 @@ public void run() { dispatcher = 
new SignatureBlocksDispatcher(); for (ExpandedColumn column : _columns) { - IDSet col = column.nodes(); + MutableIdentifiableIDSet col; + col = new MutableIdentifiableIDSet(column.id(), column.nodes()); SignatureBlocksConsumer domainGenerator; domainGenerator = new UndirectedDomainGenerator( column, @@ -175,7 +177,7 @@ public void run( nodes, columns, signatures, - new SignatureTrimmerFactory(nodes, trimmer), + new SignatureTrimmerFactory(nodes, columnIndex.toColumns(), trimmer), domains, verbose ); diff --git a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java index f54553c..0002088 100644 --- a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java +++ b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java @@ -102,6 +102,8 @@ public void print( } scores = new ArrayList<>(); } + + PrecisionScore scoreFunc = new PrecisionScore(eqIndex); int start = 0; final int end = sig.size(); int blockCount = 0; @@ -123,15 +125,8 @@ public void print( blocks.add(block); Arrays.sort(block); if (column != null ) { - scores.add( - this.score( - blockCount, - block, - columnNodes, - columnSize, - nodeSizes - ) - ); + BigDecimal score = scoreFunc.score(block, column.id()); + scores.add(new IdentifiableDouble(blockCount, score)); } String headline = "\n-- BLOCK " + blockCount + " (" + nodeCount + " NODES, " + termCount + " TERMS)"; if (column != null) { @@ -160,7 +155,7 @@ public void print( RobustSignatureIndex buffer = new RobustSignatureIndex(); new LiberalTrimmer( nodeSizes, - new CentristTrimmer(column, nodeSizes, buffer) + new CentristTrimmer(eqIndex, eqIndex.columns(), column, buffer) ).consume(new SignatureBlocksImpl(nodeId, BigDecimal.ONE, blocks)); System.out.println("\nSIGNATURE BLOCKS FOR COLUMN " + column.id() + "\n"); SignatureBlocks sigBlocks = buffer.get(nodeId); @@ -183,46 +178,6 @@ public void print( } } } - - 
private IdentifiableDouble score( - int blockId, - int[] block, - int[] column, - int columnSize, - int[] nodeSizes - ) { - final int len1 = block.length; - final int len2 = column.length; - int idx1 = 0; - int idx2 = 0; - int blSize = 0; - int overlap = 0; - while ((idx1 < len1) && (idx2 < len2)) { - final int nodeId = block[idx1]; - int comp = Integer.compare(nodeId, column[idx2]); - if (comp < 0) { - blSize += nodeSizes[nodeId]; - idx1++; - } else if (comp > 0) { - idx2++; - } else { - int nodeSize = nodeSizes[nodeId]; - blSize += nodeSize; - overlap += nodeSize; - idx1++; - idx2++; - } - } - while (idx1 < len1) { - blSize += nodeSizes[block[idx1++]]; - } - if (overlap > 0) { - BigDecimal val = new PrecisionScore().relevance(columnSize, blSize, overlap); - return new IdentifiableDouble(blockId, val.doubleValue()); - } else { - return new IdentifiableDouble(blockId, 0.0); - } - } private static final String ARG_COLUMN = "column"; private static final String ARG_FULLSIG = "fullSigConstraint"; diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java index bfc38d6..3f84dce 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java @@ -25,7 +25,7 @@ public interface SignatureBlocksConsumerFactory { public void close(); - public SignatureBlocksConsumer getConsumer(int[] nodeSizes); + public SignatureBlocksConsumer getConsumer(); public SignatureBlocksIndex signatures() throws java.io.IOException; } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java index eb51f79..64b185f 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java +++ 
b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java @@ -128,7 +128,6 @@ public SignatureBlocksGenerator() { } private void compute( - EQIndex nodeIndex, ContextSignatureGenerator sigFact, ConcurrentLinkedQueue queue, CandidateSetFinder candidateFinder, @@ -160,7 +159,7 @@ private void compute( queue, sigFact, candidateFinder, - consumerFactory.getConsumer(nodeIndex.nodeSizes()) + consumerFactory.getConsumer() ) ); } @@ -208,7 +207,6 @@ public void runWithThreshold( candidateFinder = new ThresholdFinder<>(threshold); this.compute( - eqIndex, new ContextSignatureGenerator(eqIndex.nodes()), queue, candidateFinder, @@ -249,7 +247,6 @@ public void runWithMaxDrop( ); this.compute( - eqIndex, new ContextSignatureGenerator(eqIndex.nodes()), queue, candidateFinder, diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java index 2789184..a66b63c 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java @@ -27,19 +27,25 @@ public class SignatureBlocksIndexFactory implements SignatureBlocksConsumerFactory { private SignatureBlocksConsumer _consumer = null; + private final int[] _nodeSizes; private SignatureBlocksIndex _signatures = null; + public SignatureBlocksIndexFactory(int[] nodeSizes) { + + _nodeSizes = nodeSizes; + } + @Override public void close() { } @Override - public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { + public SignatureBlocksConsumer getConsumer() { if (_consumer == null) { _signatures = new SignatureBlocksIndex(); - _consumer = new LiberalTrimmer(nodeSizes, _signatures); + _consumer = new LiberalTrimmer(_nodeSizes, _signatures); } return _consumer; } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java 
b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java index 0c3bf31..dddaf1a 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java @@ -20,8 +20,8 @@ import java.io.File; import java.util.ArrayList; import java.util.List; -import org.opendata.curation.d4.signature.trim.LiberalTrimmer; import org.opendata.core.io.FileSystem; +import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; /** * Signature blocks writer factory. Returns consumer that are open. Since this @@ -36,12 +36,18 @@ public class SignatureBlocksWriterFactory implements SignatureBlocksConsumerFact private int _count = 0; private final File _file; private SignatureBlocksConsumer _globalConsumer = null; + private final SignatureTrimmerFactory _trimmerFactory; private final boolean _outputToDir; private List _openConsumer = null; - public SignatureBlocksWriterFactory(File file, boolean outputToDir) { + public SignatureBlocksWriterFactory( + File file, + SignatureTrimmerFactory trimmerFactory, + boolean outputToDir + ) { _file = file; + _trimmerFactory = trimmerFactory; _outputToDir = outputToDir; _openConsumer = new ArrayList<>(); @@ -64,13 +70,12 @@ public void close() { } @Override - public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { + public SignatureBlocksConsumer getConsumer() { if (_outputToDir) { String filename = "signature-blocks." 
+ (_count++) + ".txt.gz"; File outputFile = FileSystem.joinPath(_file, filename); - SignatureBlocksConsumer trimmer = new LiberalTrimmer( - nodeSizes, + SignatureBlocksConsumer trimmer = _trimmerFactory.getTrimmer( new SignatureBlocksWriter(outputFile) ); trimmer.open(); @@ -78,8 +83,7 @@ public SignatureBlocksConsumer getConsumer(int[] nodeSizes) { return trimmer; } else { if (_globalConsumer == null) { - _globalConsumer = new LiberalTrimmer( - nodeSizes, + _globalConsumer = _trimmerFactory.getTrimmer( new SignatureBlocksWriter(_file) ); _globalConsumer.open(); diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java b/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java index 183840d..bb72b2f 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java @@ -18,20 +18,105 @@ package org.opendata.curation.d4.signature.trim; import java.math.BigDecimal; +import java.util.HashMap; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; /** - * + * Score function for signature blocks. + * * @author Heiko Mueller */ -public interface BlockScoreFunction { +public abstract class BlockScoreFunction { + + private final HashMap _columns; + private final HashMap _columnSize; + private final int[] _nodeSize; + + public BlockScoreFunction( + EQIndex eqIndex, + IdentifiableObjectSet columns + ) { + + _nodeSize = eqIndex.nodeSizes(); + + _columns = new HashMap<>(); + _columnSize = new HashMap<>(); + + for (Column column : columns) { + _columns.put(column.id(), column.toArray()); + int size = 0; + for (int nodeId : column) { + size += _nodeSize[nodeId]; + } + _columnSize.put(column.id(), size); + } + + } + + /** + * Get the maximum score of a signature block over all columns. 
+ * + * @param block + * @param columns + * @return + */ + public BigDecimal maxScore(int[] block, Iterable columns) { + + BigDecimal max = BigDecimal.ZERO; + + for (int columnId : columns) { + BigDecimal score = this.score(block, columnId); + if (score.compareTo(max) > 0) { + max = score; + } + } + + return max; + } + + public abstract BigDecimal relevance(int columnSize, int blockSize, int overlap); /** * Return score of a signature block for a given column. * - * @param columnSize - * @param blockSize - * @param overlap + * @param block + * @param columnId * @return */ - public BigDecimal relevance(int columnSize, int blockSize, int overlap); + public BigDecimal score(int[] block, int columnId) { + + final int[] column = _columns.get(columnId); + final int len1 = block.length; + final int len2 = column.length; + int idx1 = 0; + int idx2 = 0; + int blSize = 0; + int overlap = 0; + while ((idx1 < len1) && (idx2 < len2)) { + final int nodeId = block[idx1]; + int comp = Integer.compare(nodeId, column[idx2]); + if (comp < 0) { + blSize += _nodeSize[nodeId]; + idx1++; + } else if (comp > 0) { + idx2++; + } else { + int nodeSize = _nodeSize[nodeId]; + blSize += nodeSize; + overlap += nodeSize; + idx1++; + idx2++; + } + } + if (overlap > 0) { + while (idx1 < len1) { + blSize += _nodeSize[block[idx1++]]; + } + return this.relevance(_columnSize.get(columnId), blSize, overlap); + } else { + return BigDecimal.ZERO; + } + } } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java new file mode 100644 index 0000000..e9a3366 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java @@ -0,0 +1,128 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.trim; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.opendata.core.constraint.GreaterThanConstraint; +import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.core.constraint.Threshold; +import org.opendata.core.constraint.ZeroThreshold; +import org.opendata.core.object.IdentifiableDouble; +import org.opendata.core.object.filter.AnyObjectFilter; +import org.opendata.core.prune.CandidateSetFinder; +import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.core.sort.DoubleValueDescSort; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; + +/** + * Liberal signature blocks trimmer. The liberal trimmer prunes all + * blocks starting from the block with the most elements. Only if the first + * block is the largest block it will not be pruned. 
+ * + * @author Heiko Mueller + */ +public class CentristBlockRelevanceFilter extends SignatureTrimmer { + + private final CandidateSetFinder _dropFinder; + private final EQIndex _eqIndex; + private final BlockScoreFunction _scoreFunc; + + public CentristBlockRelevanceFilter( + EQIndex eqIndex, + BlockScoreFunction scoreFunc, + CandidateSetFinder dropFinder, + Threshold nonEmptyConstraint, + SignatureBlocksConsumer consumer + ) { + super(new AnyObjectFilter(), nonEmptyConstraint, consumer); + + _eqIndex = eqIndex; + _scoreFunc = scoreFunc; + _dropFinder = dropFinder; + } + + public CentristBlockRelevanceFilter( + EQIndex eqIndex, + BlockScoreFunction scoreFunc, + Threshold nonEmptyConstraint, + SignatureBlocksConsumer consumer + ) { + + this( + eqIndex, + scoreFunc, + new MaxDropFinder<>( + new GreaterThanConstraint(BigDecimal.ZERO), + false, + false + ), + nonEmptyConstraint, + consumer + ); + } + + public CentristBlockRelevanceFilter( + EQIndex eqIndex, + BlockScoreFunction scoreFunc, + SignatureBlocksConsumer consumer + ) { + + this(eqIndex, scoreFunc, new ZeroThreshold(), consumer); + } + + public CentristBlockRelevanceFilter( + EQIndex eqIndex, + IdentifiableObjectSet columns, + SignatureBlocksConsumer consumer + ) { + + this(eqIndex, new PrecisionScore(eqIndex, columns), new ZeroThreshold(), consumer); + } + + @Override + public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + + IDSet columns = _eqIndex.get(sig.id()).columns(); + + List elements = new ArrayList<>(); + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + int[] block = sig.get(iBlock); + Arrays.sort(block); + BigDecimal score = _scoreFunc.maxScore(block, columns); + elements.add(new IdentifiableDouble(iBlock, score)); + } + Collections.sort(elements, new DoubleValueDescSort()); + int dropIndex = _dropFinder.getPruneIndex(elements); + if (dropIndex > 0) { + if (elements.get(0).value() > 0) { + for (int i = 0; i < elements.size(); i++) { + IdentifiableDouble e = 
elements.get(i); + } + consumer.consume(new CentristSignature(sig, elements, dropIndex)); + } + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java index f24c2fc..95a31f1 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java @@ -29,9 +29,11 @@ import org.opendata.core.object.IdentifiableDouble; import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableIDSet; +import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.sort.DoubleValueDescSort; -import org.opendata.db.eq.EQHelper; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; /** * Centrist signature blocks trimmer. The centrist trimmer uses a scoring @@ -43,41 +45,35 @@ */ public class CentristTrimmer extends SignatureTrimmer { - private final int[] _column; - private final int _columnSize; + private final int _columnId; private final CandidateSetFinder _dropFinder; - private final BlockScoreFunction _func; - private final int[] _nodeSizes; - + private final BlockScoreFunction _scoreFunc; + public CentristTrimmer( - IDSet column, - int[] nodeSizes, - BlockScoreFunction func, + IdentifiableIDSet column, + BlockScoreFunction scoreFunc, CandidateSetFinder dropFinder, Threshold nonEmptyConstraint, SignatureBlocksConsumer consumer ) { super(column, nonEmptyConstraint, consumer); - _column = column.toArray(); - _nodeSizes = nodeSizes; - _func = func; + _columnId = column.id(); + _scoreFunc = scoreFunc; _dropFinder = dropFinder; - - _columnSize = EQHelper.setSize(_column, nodeSizes); } public CentristTrimmer( - IDSet column, - int[] nodeSizes, + EQIndex eqIndex, + IdentifiableObjectSet columns, + IdentifiableIDSet column, 
Threshold nonEmptyConstraint, SignatureBlocksConsumer consumer ) { this( column, - nodeSizes, - new PrecisionScore(), + new PrecisionScore(eqIndex, columns), new MaxDropFinder<>( new GreaterThanConstraint(BigDecimal.ZERO), false, @@ -89,14 +85,16 @@ public CentristTrimmer( } public CentristTrimmer( - IDSet column, - int[] nodeSizes, + EQIndex eqIndex, + IdentifiableObjectSet columns, + IdentifiableIDSet column, SignatureBlocksConsumer consumer ) { this( + eqIndex, + columns, column, - nodeSizes, new ZeroThreshold(), consumer ); @@ -108,37 +106,8 @@ public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { List elements = new ArrayList<>(); for (int iBlock = 0; iBlock < sig.size(); iBlock++) { final int[] block = sig.get(iBlock); - final int len1 = block.length; - final int len2 = _column.length; - int idx1 = 0; - int idx2 = 0; - int blSize = 0; - int overlap = 0; - while ((idx1 < len1) && (idx2 < len2)) { - final int nodeId = block[idx1]; - int comp = Integer.compare(nodeId, _column[idx2]); - if (comp < 0) { - blSize += _nodeSizes[nodeId]; - idx1++; - } else if (comp > 0) { - idx2++; - } else { - int nodeSize = _nodeSizes[nodeId]; - blSize += nodeSize; - overlap += nodeSize; - idx1++; - idx2++; - } - } - if (overlap > 0) { - while (idx1 < len1) { - blSize += _nodeSizes[block[idx1++]]; - } - BigDecimal val = _func.relevance(_columnSize, blSize, overlap); - elements.add(new IdentifiableDouble(iBlock, val.doubleValue())); - } else { - elements.add(new IdentifiableDouble(iBlock, 0.0)); - } + BigDecimal score = _scoreFunc.score(block, _columnId); + elements.add(new IdentifiableDouble(iBlock, score)); } Collections.sort(elements, new DoubleValueDescSort()); int dropIndex = _dropFinder.getPruneIndex(elements); diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java b/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java index 9971838..0edb0b8 100644 --- 
a/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java @@ -19,6 +19,9 @@ import java.math.BigDecimal; import java.math.MathContext; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; /** * Block score function that uses the smaller of the column size and block size @@ -26,8 +29,13 @@ * * @author Heiko Mueller */ -public class MinJIScore implements BlockScoreFunction { +public class MinJIScore extends BlockScoreFunction { + public MinJIScore(EQIndex eqIndex, IdentifiableObjectSet columns) { + + super(eqIndex, columns); + } + @Override public BigDecimal relevance(int columnSize, int blockSize, int overlap) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java b/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java index ef14163..aef7344 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java @@ -19,13 +19,26 @@ import java.math.BigDecimal; import org.opendata.core.metric.Precision; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; /** * * @author Heiko Mueller */ -public class PrecisionScore implements BlockScoreFunction { +public class PrecisionScore extends BlockScoreFunction { + public PrecisionScore(EQIndex eqIndex, IdentifiableObjectSet columns) { + + super(eqIndex, columns); + } + + public PrecisionScore(EQIndex eqIndex) { + + super(eqIndex, eqIndex.columns()); + } + @Override public BigDecimal relevance(int columnSize, int blockSize, int overlap) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index d253e8a..6aa64b7 
100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -19,7 +19,9 @@ import org.opendata.core.constraint.Threshold; import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableIDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -29,15 +31,36 @@ */ public class SignatureTrimmerFactory { + private final IdentifiableObjectSet _columns; private final EQIndex _nodes; private final String _trimmerSpec; - public SignatureTrimmerFactory(EQIndex nodes, String trimmerSpec) { - + public SignatureTrimmerFactory( + EQIndex nodes, + IdentifiableObjectSet columns, + String trimmerSpec + ) { _nodes = nodes; + _columns = columns; _trimmerSpec = trimmerSpec; } + /** + * Create an instance for a signature trimmer that is column independent. + * + * @param consumer + * @return + */ + public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { + + if (_trimmerSpec.equals(SignatureTrimmer.CENTRIST)) { + return new CentristBlockRelevanceFilter(_nodes, _columns, consumer); + } else if (_trimmerSpec.equals(SignatureTrimmer.LIBERAL)) { + return new LiberalTrimmer(_nodes.nodeSizes(), consumer); + } + throw new IllegalArgumentException(String.format("Invalid trimmer: %s", _trimmerSpec)); + } + /** * Get column specific trimmer for a given column. We currently do not make * use of the empty signature constraint. 
@@ -46,18 +69,19 @@ public SignatureTrimmerFactory(EQIndex nodes, String trimmerSpec) { * @param consumer * @return */ - public SignatureTrimmer getTrimmer(IDSet column, SignatureBlocksConsumer consumer) { + public SignatureTrimmer getTrimmer(IdentifiableIDSet column, SignatureBlocksConsumer consumer) { if (_trimmerSpec.equals(SignatureTrimmer.CONSERVATIVE)) { return new ConservativeTrimmer(column, consumer); } else if (_trimmerSpec.equals(SignatureTrimmer.CENTRIST)) { - return new CentristTrimmer(column, _nodes.nodeSizes(), consumer); + return new CentristTrimmer(_nodes, _columns, column, consumer); } else if (_trimmerSpec.startsWith(SignatureTrimmer.CENTRIST)) { int pos = _trimmerSpec.indexOf(":"); if (pos != -1) { return new CentristTrimmer( + _nodes, + _columns, column, - _nodes.nodeSizes(), Threshold.getConstraint(_trimmerSpec.substring(pos + 1)), consumer ); From 66d84fdfc3b4fa621fb3586019ae4408d02ec4ce Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 24 Oct 2020 22:45:53 -0400 Subject: [PATCH 03/25] Add block usage writer --- .../d4/prov/CentristBlockUsageWriter.java | 190 ++++++++++++++++++ .../d4/signature/trim/CentristTrimmer.java | 38 ++++ 2 files changed, 228 insertions(+) create mode 100644 src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java diff --git a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java new file mode 100644 index 0000000..558eb44 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java @@ -0,0 +1,190 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.prov; + +import java.io.File; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.HashObjectSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.curation.d4.Constants; +import org.opendata.curation.d4.column.ExpandedColumn; +import org.opendata.curation.d4.column.ExpandedColumnIndex; +import org.opendata.curation.d4.column.ExpandedColumnReader; +import org.opendata.curation.d4.signature.ConcurrentSignatureBlocksStream; +import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.trim.CentristTrimmer; +import org.opendata.curation.d4.signature.trim.PrecisionScore; +import org.opendata.db.Database; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQ; +import org.opendata.db.eq.EQIndex; +import org.opendata.db.eq.Node; + +/** + * For each signature count the blocks (and their size) for those blocks that + * are part of at least one centrist column signature. 
+ * + * @author heiko + */ +public class CentristBlockUsageWriter { + + private class OverlapComputer implements Runnable { + + private final EQIndex _eqIndex; + private final File _outputFile; + private final ConcurrentSignatureBlocksStream _signatures; + private final HashMap _trimmer; + public OverlapComputer( + EQIndex eqIndex, + HashMap trimmer, + ConcurrentSignatureBlocksStream signatures, + File outputFile + ) { + _eqIndex = eqIndex; + _trimmer = trimmer; + _signatures = signatures; + _outputFile = outputFile; + } + + @Override + public void run() { + + try (PrintWriter out = FileSystem.openPrintWriter(_outputFile)) { + SignatureBlocks sig; + while ((sig = _signatures.next()) != null) { + HashIDSet blocks = new HashIDSet(); + for (int columnId : _eqIndex.get(sig.id()).columns()) { + CentristTrimmer trimmer = _trimmer.get(columnId); + blocks.add(trimmer.trimmedBlocks(sig)); + } + int sigSize = 0; + int usedBlocksSize = 0; + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + int blockLen = sig.get(iBlock).length; + sigSize += blockLen; + if (blocks.contains(iBlock)) { + usedBlocksSize += blockLen; + } + } + out.println( + String.format( + "%d\t%d\t%d\t%d\t%d", + sig.id(), + sig.size(), + blocks.length(), + sigSize, + usedBlocksSize + ) + ); + } + } catch (java.io.IOException ex) { + throw new RuntimeException(ex); + } + } + } + + public void run( + EQIndex eqIndex, + ConcurrentSignatureBlocksStream signatures, + int threads, + File outputDir + ) { + + FileSystem.createFolder(outputDir); + + PrecisionScore scoreFunc = new PrecisionScore(eqIndex); + + HashMap trimmer = new HashMap<>(); + for (Column column : eqIndex.columns()) { + trimmer.put(column.id(), new CentristTrimmer(column, scoreFunc)); + } + + ExecutorService es = Executors.newCachedThreadPool(); + for (int iThread = 0; iThread < threads; iThread++) { + String filename = "centrist-blocks-usage." 
+ iThread + ".tsv.gz"; + File outputFile = FileSystem.joinPath(outputDir, filename); + OverlapComputer thread; + thread = new OverlapComputer(eqIndex, trimmer, signatures, outputFile); + es.execute(thread); + } + es.shutdown(); + try { + es.awaitTermination(threads, TimeUnit.DAYS); + } catch (java.lang.InterruptedException ex) { + throw new RuntimeException(ex); + } + } + + private final static String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " \n" + + " "; + + private final static Logger LOGGER = Logger + .getLogger(CentristBlockUsageWriter.class.getName()); + + public static void main(String[] args) { + + System.out.println("Centrist Overlap Writer - Version (" + Constants.VERSION + ")\n"); + + if (args.length != 4) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File signatureDir = new File(args[1]); + int threads = Integer.parseInt(args[2]); + File outputDir = new File(args[3]); + + EQIndex eqIndex = null; + try { + eqIndex = new EQIndex(eqFile); + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "READ EQs", ex); + System.exit(-1); + } + + ConcurrentSignatureBlocksStream signatures = null; + try { + signatures = new ConcurrentSignatureBlocksStream(signatureDir); + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "READ SIGNATURES", ex); + System.exit(-1); + } + + try { + new CentristBlockUsageWriter() + .run(eqIndex, signatures, threads, outputDir); + } catch (java.lang.RuntimeException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java index 95a31f1..e015659 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java @@ -29,6 +29,8 @@ import org.opendata.core.object.IdentifiableDouble; import 
org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.sort.DoubleValueDescSort; @@ -100,6 +102,24 @@ public CentristTrimmer( ); } + public CentristTrimmer( + IdentifiableIDSet column, + BlockScoreFunction scoreFunc + ) { + + this( + column, + scoreFunc, + new MaxDropFinder<>( + new GreaterThanConstraint(BigDecimal.ZERO), + false, + false + ), + new ZeroThreshold(), + null + ); + } + @Override public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { @@ -117,4 +137,22 @@ public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { } } } + + public IDSet trimmedBlocks(SignatureBlocks sig) { + + List elements = new ArrayList<>(); + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + final int[] block = sig.get(iBlock); + BigDecimal score = _scoreFunc.score(block, _columnId); + elements.add(new IdentifiableDouble(iBlock, score)); + } + Collections.sort(elements, new DoubleValueDescSort()); + int dropIndex = _dropFinder.getPruneIndex(elements); + + HashIDSet result = new HashIDSet(); + for (int iEl = 0; iEl < dropIndex; iEl++) { + result.add(elements.get(iEl).id()); + } + return result; + } } From 52d8fbcc1bd892852eb8766ab2bd9587c0cf1755 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 24 Oct 2020 22:47:59 -0400 Subject: [PATCH 04/25] Add concurrent signature stream --- .../d4/prov/CentristBlockUsageWriter.java | 9 -- .../ConcurrentSignatureBlocksStream.java | 84 +++++++++++++++++++ 2 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java diff --git a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java 
index 558eb44..3a98edb 100644 --- a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java +++ b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java @@ -19,7 +19,6 @@ import java.io.File; import java.io.PrintWriter; -import java.util.Arrays; import java.util.HashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -28,21 +27,13 @@ import java.util.logging.Logger; import org.opendata.core.io.FileSystem; import org.opendata.core.set.HashIDSet; -import org.opendata.core.set.HashObjectSet; -import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.curation.d4.Constants; -import org.opendata.curation.d4.column.ExpandedColumn; -import org.opendata.curation.d4.column.ExpandedColumnIndex; -import org.opendata.curation.d4.column.ExpandedColumnReader; import org.opendata.curation.d4.signature.ConcurrentSignatureBlocksStream; import org.opendata.curation.d4.signature.SignatureBlocks; import org.opendata.curation.d4.signature.trim.CentristTrimmer; import org.opendata.curation.d4.signature.trim.PrecisionScore; -import org.opendata.db.Database; import org.opendata.db.column.Column; -import org.opendata.db.eq.EQ; import org.opendata.db.eq.EQIndex; -import org.opendata.db.eq.Node; /** * For each signature count the blocks (and their size) for those blocks that diff --git a/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java b/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java new file mode 100644 index 0000000..6d249e6 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java @@ -0,0 +1,84 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature; + +import java.io.BufferedReader; +import java.io.File; +import java.math.BigDecimal; +import java.util.LinkedList; +import org.opendata.core.io.FileSetReader; +import org.opendata.core.io.FileSystem; + +/** + * Reader for a signature blocks file. Generates a stream of signature blocks + * for a given consumer. + * + * @author Heiko Mueller + */ +public class ConcurrentSignatureBlocksStream extends FileSetReader { + + private final LinkedList _files; + private BufferedReader _in = null; + + public ConcurrentSignatureBlocksStream(File file, boolean verbose) throws java.io.IOException { + + super(file, verbose); + + _files = new LinkedList<>(); + for (File inputFile : this) { + _files.add(inputFile); + } + + if (!_files.isEmpty()) { + _in = FileSystem.openReader(_files.pop()); + } + } + + public ConcurrentSignatureBlocksStream(File file) throws java.io.IOException { + + this(file, false); + } + + public synchronized SignatureBlocks next() throws java.io.IOException { + + while (_in != null) { + String line = _in.readLine(); + if (line != null) { + String[] tokens = line.split("\t"); + int[][] blocks = new int[tokens.length - 2][]; + for (int iToken = 2; iToken < tokens.length; iToken++) { + blocks[iToken - 2] = SignatureBlocksReader. 
+ getBlockNodes(tokens[iToken]); + } + return new SignatureBlocksImpl( + Integer.parseInt(tokens[0]), + new BigDecimal(tokens[1]), + blocks + ); + } else { + _in.close(); + _in = null; + if (!_files.isEmpty()) { + _in = FileSystem.openReader(_files.pop()); + } + } + } + + return null; + } +} From e0ccf584bc380c3e1f4b7191121cde2929049e20 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 24 Oct 2020 22:55:43 -0400 Subject: [PATCH 05/25] Copy signature blocks reader from prov branch --- pom.xml | 2 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../d4/prov/CentristBlockUsageWriter.java | 2 +- .../d4/signature/SignatureBlocksReader.java | 26 +++++++++++++++++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index eceba4d..79d60f3 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ - org.opendata.curation.d4.explore.BlockColSim + org.opendata.curation.d4.prov.CentristBlockUsageWriter diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index fe8eaea..aa0143e 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev1"; + public static final String VERSION = "0.29.0.dev2"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java index 3a98edb..42fdfb1 100644 --- a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java +++ b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java @@ -142,7 +142,7 @@ public void run( public static void main(String[] args) { - System.out.println("Centrist Overlap Writer - Version (" + Constants.VERSION + ")\n"); + 
System.out.println("Centrist Block Usage Writer - Version (" + Constants.VERSION + ")\n"); if (args.length != 4) { System.out.println(COMMAND); diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java index 712c1a6..b792996 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java @@ -20,9 +20,10 @@ import java.io.BufferedReader; import java.io.File; import java.math.BigDecimal; +import java.util.Arrays; +import java.util.List; import org.opendata.core.io.FileSetReader; import org.opendata.core.io.FileSystem; -import org.opendata.core.util.ArrayHelper; /** * Reader for a signature blocks file. Generates a stream of signature blocks @@ -42,6 +43,27 @@ public SignatureBlocksReader(File file) { this(file, false); } + public SignatureBlocksReader(List files) { + + super(files, false); + } + + public static int[] getBlockNodes(String text) { + + String[] tokens = text.split(","); + int[] nodes = new int[tokens.length]; + for (int iToken = 0; iToken < tokens.length; iToken++) { + String val = tokens[iToken]; + int pos = val.indexOf(":"); + if (pos != -1) { + val = val.substring(0, pos); + } + nodes[iToken] = Integer.parseInt(val); + } + Arrays.sort(nodes); + return nodes; + } + public SignatureBlocksIndex read() throws java.io.IOException { SignatureBlocksIndex buffer = new SignatureBlocksIndex(); @@ -61,7 +83,7 @@ public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException String[] tokens = line.split("\t"); int[][] blocks = new int[tokens.length - 2][]; for (int iToken = 2; iToken < tokens.length; iToken++) { - blocks[iToken - 2] = ArrayHelper.arrayFromString(tokens[iToken]); + blocks[iToken - 2] = this.getBlockNodes(tokens[iToken]); } SignatureBlocks sig = new SignatureBlocksImpl( Integer.parseInt(tokens[0]), From 
54b9733b1659c8b8aff5f9176fd9c876afe5e624 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Tue, 27 Oct 2020 16:15:47 -0400 Subject: [PATCH 06/25] Add signature drop information writer --- pom.xml | 2 +- .../signature/SignatureBlocksDropWriter.java | 254 ++++++++++++++++++ 2 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java diff --git a/pom.xml b/pom.xml index 79d60f3..5a5b3ff 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ - org.opendata.curation.d4.prov.CentristBlockUsageWriter + org.opendata.curation.d4.signature.SignatureBlocksDropWriter diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java new file mode 100644 index 0000000..cc9ca42 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java @@ -0,0 +1,254 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.signature; + +import java.io.File; +import java.io.PrintWriter; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.constraint.GreaterThanConstraint; +import org.opendata.core.io.FileSystem; +import org.opendata.core.io.SynchronizedWriter; +import org.opendata.core.prune.CandidateSetFinder; +import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.set.IDSet; +import org.opendata.db.eq.EQIndex; + +/** + * Generate output file containing information about the steepest drops in + * context signatures. + * + * The output contains a single tab-delimited line for each equivalence class + * containing the following information: + * + * - equivalence class identifier + * - similarity of first context signature entry + * - list of similarities for nodes where the steepest drop occurs (separated + * by ':').
+ * + * @author Heiko Mueller + */ +public class SignatureBlocksDropWriter { + + private class SignatureDrop { + + private final int _blockLength; + private final int _columnCount; + private final BigDecimal _firstElement; + private final BigDecimal _lastElement; + + public SignatureDrop(BigDecimal first, BigDecimal last, int blockLength, int columnCount) { + + _firstElement = first; + _lastElement = last; + _blockLength = blockLength; + _columnCount = columnCount; + } + + @Override + public String toString() { + + return String.format( + "%s-%s:%d:%d", + _firstElement.setScale(2, RoundingMode.HALF_DOWN).toPlainString(), + _lastElement.setScale(2, RoundingMode.HALF_DOWN).toPlainString(), + _blockLength, + _columnCount + ); + } + } + + private class BlockGeneratorTask implements Runnable { + + private final CandidateSetFinder _candidateFinder; + private final EQIndex _eqIndex; + private final ConcurrentLinkedQueue _queue; + private final ContextSignatureGenerator _sigFact; + private final SynchronizedWriter _writer; + + public BlockGeneratorTask( + EQIndex eqIndex, + ConcurrentLinkedQueue queue, + ContextSignatureGenerator sigFact, + CandidateSetFinder candidateFinder, + SynchronizedWriter writer + ) { + _eqIndex = eqIndex; + _queue = queue; + _sigFact = sigFact; + _candidateFinder = candidateFinder; + _writer = writer; + } + + @Override + public void run() { + + Integer nodeId; + while ((nodeId = _queue.poll()) != null) { + List sig; + sig = _sigFact.getSignature(nodeId).rankedElements(); + // No output if the context signature is empty + if (sig.isEmpty()) { + continue; + } + int start = 0; + final int end = sig.size(); + ArrayList drops = new ArrayList<>(); + while (start < end) { + int pruneIndex = _candidateFinder.getPruneIndex(sig, start); + if (pruneIndex <= start) { + break; + } + int blockLen = pruneIndex - start; + IDSet columns = _eqIndex.get(nodeId).columns(); + for (int iEl = start; iEl < pruneIndex; iEl++) { + int memberId = sig.get(iEl).id(); +
columns = columns.intersect(_eqIndex.get(memberId).columns()); + if (columns.isEmpty()) { + break; + } + } + drops.add( + new SignatureDrop( + sig.get(start).toBigDecimal(), + sig.get(pruneIndex - 1).toBigDecimal(), + blockLen, + columns.length() + ) + ); + start = pruneIndex; + } + if (drops.isEmpty()) { + continue; + } + String line = nodeId + "\t" + drops.get(0).toString(); + for (int iDrop = 1; iDrop < drops.size(); iDrop++) { + line += "|" + drops.get(iDrop).toString(); + } + _writer.write(line); + } + } + } + + public void run( + EQIndex eqIndex, + ContextSignatureGenerator sigFact, + ConcurrentLinkedQueue queue, + CandidateSetFinder candidateFinder, + int threads, + boolean verbose, + SynchronizedWriter writer + ) throws java.lang.InterruptedException, java.io.IOException { + + if (verbose) { + System.out.println( + String.format( + "SIGNATURE BLOCKS FOR %d EQs USING:\n" + + " --threads=%d", + queue.size(), + threads + ) + ); + } + + Date start = new Date(); + if (verbose) { + System.out.println("START @ " + start); + } + + ExecutorService es = Executors.newCachedThreadPool(); + for (int iThread = 0; iThread < threads; iThread++) { + es.execute( + new BlockGeneratorTask( + eqIndex, + queue, + sigFact, + candidateFinder, + writer + ) + ); + } + es.shutdown(); + try { + es.awaitTermination(threads, TimeUnit.DAYS); + } catch (java.lang.InterruptedException ex) { + throw new RuntimeException(ex); + } + + Date end = new Date(); + if (verbose) { + System.out.println("END @ " + end); + } + } + + private final static String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " "; + + private final static Logger LOGGER = Logger + .getLogger(SignatureBlocksDropWriter.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + int threads = Integer.parseInt(args[1]); + File outputFile = new File(args[2]); + + boolean fullSignatureConstraint = false; + 
boolean ignoreLastDrop = true; + + MaxDropFinder candidateFinder; + candidateFinder = new MaxDropFinder<>( + new GreaterThanConstraint(BigDecimal.ZERO), + fullSignatureConstraint, + ignoreLastDrop + ); + + try (PrintWriter out = FileSystem.openPrintWriter(outputFile)) { + EQIndex eqIndex = new EQIndex(eqFile); + new SignatureBlocksDropWriter().run( + eqIndex, + new ContextSignatureGenerator(eqIndex.nodes()), + new ConcurrentLinkedQueue<>(eqIndex.keys().toList()), + candidateFinder, + threads, + true, + new SynchronizedWriter(out) + ); + } catch (java.lang.InterruptedException | java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} From b8a5c6c9edab9dd26f0b892a7d8608b4c285f09b Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 28 Oct 2020 18:49:33 -0400 Subject: [PATCH 07/25] Add column support trimmer --- pom.xml | 2 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../curation/d4/experiments/GTWriter.java | 154 ++++++++++++++++++ .../trim/ColumnSupportBlockFilter.java | 95 +++++++++++ .../d4/signature/trim/SignatureTrimmer.java | 1 + .../trim/SignatureTrimmerFactory.java | 9 +- 6 files changed, 258 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/experiments/GTWriter.java create mode 100644 src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java diff --git a/pom.xml b/pom.xml index 5a5b3ff..2506651 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ - org.opendata.curation.d4.signature.SignatureBlocksDropWriter + org.opendata.curation.d4.experiments.GTWriter diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index aa0143e..f44534b 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - 
public static final String VERSION = "0.29.0.dev2"; + public static final String VERSION = "0.29.0.dev3"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/experiments/GTWriter.java b/src/main/java/org/opendata/curation/d4/experiments/GTWriter.java new file mode 100644 index 0000000..ba632f2 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/GTWriter.java @@ -0,0 +1,154 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import java.io.BufferedReader; +import java.io.File; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; +import org.opendata.db.eq.EQ; +import org.opendata.db.eq.EQIndex; +import org.opendata.db.term.Term; +import org.opendata.db.term.TermConsumer; +import org.opendata.db.term.TermIndexReader; + +/** + * Find node and term identifier for ground truth domain terms. 
+ * + * @author Heiko Mueller + */ +public class GTWriter { + + private class TermCollector implements TermConsumer { + + private final HashSet _names; + private HashMap _terms; + + public TermCollector(HashSet names) { + + _names = names; + } + + @Override + public void close() { + + } + + @Override + public void consume(Term term) { + + if (_names.contains(term.name())) { + _terms.put(term.name(), term); + } + } + + @Override + public void open() { + + _terms = new HashMap<>(); + } + + public HashMap terms() { + + return _terms; + } + } + + public void run( + EQIndex eqIndex, + TermIndexReader termReader, + HashSet terms, + PrintWriter out + ) throws java.io.IOException { + + TermCollector collector = new TermCollector(terms); + termReader.read(collector); + + System.out.println( + String.format( + "FOUND %d OF %d TERMS", + collector.terms().size(), + terms.size() + ) + ); + + List foundTerms = new ArrayList<>(collector.terms().keySet()); + Collections.sort(foundTerms); + + for (String name : foundTerms) { + Term term = collector.terms().get(name); + for (EQ node : eqIndex) { + if (node.terms().contains(term.id())) { + out.println(String.format("%d\t%d\t%s", node.id(), term.id(), name)); + break; + } + } + } + } + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " \n" + + " "; + + public static void main(String[] args) { + + if (args.length != 4) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File termIndex = new File(args[1]); + File inputDir = new File(args[2]); + File outputDir = new File(args[3]); + + FileSystem.createFolder(outputDir); + + try { + EQIndex eqIndex = new EQIndex(eqFile); + TermIndexReader termReader = new TermIndexReader(termIndex); + for (File inFile : inputDir.listFiles()) { + System.out.println(inFile.getName()); + File outFile = FileSystem.joinPath(outputDir, inFile.getName()); + HashSet terms = new HashSet<>(); + try (BufferedReader in = 
FileSystem.openReader(inFile)) { + String line; + while ((line = in.readLine()) != null) { + terms.add(line.split("\t")[2]); + } + } + try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { + new GTWriter().run(eqIndex, termReader, terms, out); + } + } + } catch (java.io.IOException ex) { + Logger.getLogger(GTWriter.class.getName()).log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java b/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java new file mode 100644 index 0000000..350f866 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java @@ -0,0 +1,95 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.trim; + +import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.core.constraint.Threshold; +import org.opendata.core.constraint.ZeroThreshold; +import org.opendata.core.object.filter.AnyObjectFilter; +import org.opendata.core.set.IDSet; +import org.opendata.curation.d4.signature.SignatureBlocksImpl; +import org.opendata.db.eq.EQIndex; + +/** + * Filter signature blocks based on column support. 
Includes only those blocks + * the contain nodes that all occur together in at least one column. + * + * @author Heiko Mueller + */ +public class ColumnSupportBlockFilter extends SignatureTrimmer { + + private final EQIndex _eqIndex; + private final int _minStart; + + public ColumnSupportBlockFilter( + EQIndex eqIndex, + int minStart, + Threshold nonEmptyConstraint, + SignatureBlocksConsumer consumer + ) { + super(new AnyObjectFilter(), nonEmptyConstraint, consumer); + + _eqIndex = eqIndex; + _minStart = minStart; + } + + public ColumnSupportBlockFilter( + EQIndex eqIndex, + int minStart, + SignatureBlocksConsumer consumer + ) { + + this(eqIndex, minStart, new ZeroThreshold(), consumer); + } + + public ColumnSupportBlockFilter( + EQIndex eqIndex, + SignatureBlocksConsumer consumer + ) { + + this(eqIndex, 0, new ZeroThreshold(), consumer); + } + + @Override + public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + + IDSet nodeColumns = _eqIndex.get(sig.id()).columns(); + + int lastIndex = 0; + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + IDSet columns = nodeColumns; + for (int nodeId : sig.get(iBlock)) { + columns = columns.intersect(_eqIndex.get(nodeId).columns()); + if (columns.isEmpty()) { + break; + } + } + if (columns.isEmpty()) { + break; + } + lastIndex++; + } + int sigSize = Math.max(_minStart, lastIndex); + int[][] blocks = new int[sigSize][]; + for (int iBlock = 0; iBlock < blocks.length; iBlock++) { + blocks[iBlock] = sig.get(iBlock); + } + consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), blocks)); + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java index 03b0351..a6993ce 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java @@ -35,6 +35,7 @@ public abstract class 
SignatureTrimmer implements SignatureBlocksConsumer { * Global variables for trimmer types */ public final static String CENTRIST = "CENTRIST"; + public final static String COLSUPP = "COLSUPP"; public final static String CONSERVATIVE = "CONSERVATIVE"; public final static String LIBERAL = "LIBERAL"; diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index 6aa64b7..ea626db 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -53,8 +53,8 @@ public SignatureTrimmerFactory( */ public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { - if (_trimmerSpec.equals(SignatureTrimmer.CENTRIST)) { - return new CentristBlockRelevanceFilter(_nodes, _columns, consumer); + if (_trimmerSpec.equals(SignatureTrimmer.COLSUPP)) { + return new ColumnSupportBlockFilter(_nodes,consumer); } else if (_trimmerSpec.equals(SignatureTrimmer.LIBERAL)) { return new LiberalTrimmer(_nodes.nodeSizes(), consumer); } @@ -69,7 +69,10 @@ public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { * @param consumer * @return */ - public SignatureTrimmer getTrimmer(IdentifiableIDSet column, SignatureBlocksConsumer consumer) { + public SignatureTrimmer getTrimmer( + IdentifiableIDSet column, + SignatureBlocksConsumer consumer + ) { if (_trimmerSpec.equals(SignatureTrimmer.CONSERVATIVE)) { return new ConservativeTrimmer(column, consumer); From 07c070848141c7116394a85c8c3f094a7cfe0bc5 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 28 Oct 2020 19:41:03 -0400 Subject: [PATCH 08/25] Add GT best match --- pom.xml | 2 +- .../curation/d4/experiments/BestGTMatch.java | 83 +++++++++++++++++++ .../curation/d4/experiments/GTReader.java | 45 ++++++++++ .../experiments/StrongDomainJsonReader.java | 79 ++++++++++++++++++ 4 files 
changed, 208 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java create mode 100644 src/main/java/org/opendata/curation/d4/experiments/GTReader.java create mode 100644 src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java diff --git a/pom.xml b/pom.xml index 2506651..9a78ea2 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ - org.opendata.curation.d4.experiments.GTWriter + org.opendata.curation.d4.experiments.BestGTMatch diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java new file mode 100644 index 0000000..e1060e0 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java @@ -0,0 +1,83 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.experiments; + +import java.io.File; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.metric.F1; +import org.opendata.core.metric.Precision; +import org.opendata.core.metric.Recall; +import org.opendata.core.set.IDSet; +import org.opendata.core.set.MutableIdentifiableIDSet; + +/** + * + * @author Heiko Mueller + */ +public class BestGTMatch { + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " [true | false]"; + + private static final Logger LOGGER = Logger.getLogger(BestGTMatch.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File gtDir = new File(args[0]); + File domainDir = new File(args[1]); + boolean firstBlockOnly = Boolean.parseBoolean(args[2]); + + try { + List domains; + domains = new StrongDomainJsonReader().readAll(domainDir, firstBlockOnly); + for (File file : gtDir.listFiles()) { + String name = file.getName().substring(0, file.getName().indexOf(".")); + IDSet gt = new GTReader().read(file); + BigDecimal maxF1 = BigDecimal.ZERO; + for (MutableIdentifiableIDSet domain : domains) { + int ovp = domain.overlap(gt); + if (ovp > 0) { + BigDecimal f1 = new F1( + new Precision(ovp, domain.length()), + new Recall(ovp, gt.length()) + ).value(); + if (maxF1.compareTo(f1) < 0) { + maxF1 = f1; + } + } + } + System.out.println(String.format("%s\t%s", name, maxF1.setScale(6, RoundingMode.HALF_DOWN).toPlainString())); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/experiments/GTReader.java b/src/main/java/org/opendata/curation/d4/experiments/GTReader.java new file mode 100644 index 0000000..ebb59fb --- /dev/null +++ 
b/src/main/java/org/opendata/curation/d4/experiments/GTReader.java @@ -0,0 +1,45 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import java.io.BufferedReader; +import java.io.File; +import org.opendata.core.io.FileSystem; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; + +/** + * + * @author Heiko Mueller + */ +public class GTReader { + + public IDSet read(File file) throws java.io.IOException { + + HashIDSet terms = new HashIDSet(); + + try (BufferedReader in = FileSystem.openReader(file)) { + String line; + while ((line = in.readLine()) != null) { + terms.add(Integer.parseInt(line.split("\t")[1])); + } + } + + return terms; + } +} diff --git a/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java b/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java new file mode 100644 index 0000000..5625039 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java @@ -0,0 +1,79 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import com.google.gson.JsonArray; +import com.google.gson.JsonParser; +import java.io.File; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.List; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.MutableIdentifiableIDSet; + +/** + * + * @author Heiko Mueller + */ +public class StrongDomainJsonReader { + + public MutableIdentifiableIDSet read( + File file, + boolean firstBlockOnly + ) throws java.io.IOException { + + JsonArray blocks; + blocks = new JsonParser() + .parse(new FileReader(file)) + .getAsJsonObject() + .get("terms") + .getAsJsonArray(); + + int domainId = Integer.parseInt(file.getName().substring(0, file.getName().indexOf("."))); + HashIDSet terms = new HashIDSet(); + + int blockCount = 1; + if (!firstBlockOnly) { + blockCount = blocks.size(); + } + for (int iBlock = 0; iBlock < blockCount; iBlock++) { + JsonArray block = blocks.get(iBlock).getAsJsonArray(); + for (int iTerm = 0; iTerm < block.size(); iTerm++) { + int termId = block.get(iTerm).getAsJsonObject().get("id").getAsInt(); + terms.add(termId); + } + } + return new MutableIdentifiableIDSet(domainId, terms); + } + + public List readAll( + File inputDir, + boolean firstBlockOnly + ) throws java.io.IOException { + + List domains = new ArrayList<>(); + + for (File file : inputDir.listFiles()) { + if (file.getName().endsWith(".json")) { + domains.add(this.read(file, firstBlockOnly)); + } + } + + return domains; + } +} From 72cfc0fa220f22a0390a0ccb75aae0eb88cdbe46 Mon Sep 17 
00:00:00 2001 From: Heiko Mueller Date: Thu, 29 Oct 2020 12:50:24 -0400 Subject: [PATCH 09/25] Avoid multiple scoring function instances for column support trimmer --- pom.xml | 1 + .../org/opendata/curation/d4/Constants.java | 2 +- .../curation/d4/experiments/BestGTMatch.java | 27 ++++++++++++------- .../d4/export/ExportStrongDomains.java | 2 +- .../d4/signature/ContextSignaturePrinter.java | 2 +- .../d4/signature/trim/CentristTrimmer.java | 20 +++----------- .../trim/SignatureTrimmerFactory.java | 12 ++++++--- 7 files changed, 35 insertions(+), 31 deletions(-) diff --git a/pom.xml b/pom.xml index 9a78ea2..bce55e0 100644 --- a/pom.xml +++ b/pom.xml @@ -75,6 +75,7 @@ org.opendata.curation.d4.experiments.BestGTMatch + diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index f44534b..84b540e 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev3"; + public static final String VERSION = "0.29.0.dev4"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java index e1060e0..1b37587 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java @@ -19,7 +19,6 @@ import java.io.File; import java.math.BigDecimal; -import java.math.RoundingMode; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; @@ -28,6 +27,7 @@ import org.opendata.core.metric.Recall; import org.opendata.core.set.IDSet; import org.opendata.core.set.MutableIdentifiableIDSet; +import org.opendata.core.util.FormatedBigDecimal; /** * @@ -54,26 +54,35 
@@ public static void main(String[] args) { File domainDir = new File(args[1]); boolean firstBlockOnly = Boolean.parseBoolean(args[2]); + System.out.println("DOMAIN\tPRECISION\tRECALL\tF1"); + try { List domains; domains = new StrongDomainJsonReader().readAll(domainDir, firstBlockOnly); for (File file : gtDir.listFiles()) { String name = file.getName().substring(0, file.getName().indexOf(".")); IDSet gt = new GTReader().read(file); - BigDecimal maxF1 = BigDecimal.ZERO; + BigDecimal[] bestMatch = new BigDecimal[]{BigDecimal.ZERO, BigDecimal.ZERO, BigDecimal.ZERO}; for (MutableIdentifiableIDSet domain : domains) { int ovp = domain.overlap(gt); if (ovp > 0) { - BigDecimal f1 = new F1( - new Precision(ovp, domain.length()), - new Recall(ovp, gt.length()) - ).value(); - if (maxF1.compareTo(f1) < 0) { - maxF1 = f1; + Precision precision = new Precision(ovp, domain.length()); + Recall recall = new Recall(ovp, gt.length()); + BigDecimal f1 = new F1(precision, recall).value(); + if (bestMatch[2].compareTo(f1) < 0) { + bestMatch = new BigDecimal[]{precision.value(), recall.value(), f1}; } } } - System.out.println(String.format("%s\t%s", name, maxF1.setScale(6, RoundingMode.HALF_DOWN).toPlainString())); + System.out.println( + String.format( + "%s\t%s\t%s\t%s", + name, + new FormatedBigDecimal(bestMatch[0]), + new FormatedBigDecimal(bestMatch[1]), + new FormatedBigDecimal(bestMatch[2]) + ) + ); } } catch (java.io.IOException ex) { LOGGER.log(Level.SEVERE, "RUN", ex); diff --git a/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java b/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java index 7fe24f3..e24296c 100644 --- a/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java +++ b/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java @@ -252,7 +252,7 @@ public static void main(String[] args) { System.out.println(Constants.NAME + " - Export Strong Domains - Version (" + Constants.VERSION + ")\n"); - if 
(args.length != 5) { + if (args.length != 6) { System.out.println(COMMAND); System.exit(-1); } diff --git a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java index 0002088..b4b2c8d 100644 --- a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java +++ b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java @@ -155,7 +155,7 @@ public void print( RobustSignatureIndex buffer = new RobustSignatureIndex(); new LiberalTrimmer( nodeSizes, - new CentristTrimmer(eqIndex, eqIndex.columns(), column, buffer) + new CentristTrimmer(column, new PrecisionScore(eqIndex, eqIndex.columns()), buffer) ).consume(new SignatureBlocksImpl(nodeId, BigDecimal.ONE, blocks)); System.out.println("\nSIGNATURE BLOCKS FOR COLUMN " + column.id() + "\n"); SignatureBlocks sigBlocks = buffer.get(nodeId); diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java index e015659..a90b076 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java @@ -32,10 +32,7 @@ import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableIDSet; -import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.sort.DoubleValueDescSort; -import org.opendata.db.column.Column; -import org.opendata.db.eq.EQIndex; /** * Centrist signature blocks trimmer. 
The centrist trimmer uses a scoring @@ -66,16 +63,15 @@ public CentristTrimmer( } public CentristTrimmer( - EQIndex eqIndex, - IdentifiableObjectSet columns, IdentifiableIDSet column, + BlockScoreFunction scoreFunc, Threshold nonEmptyConstraint, SignatureBlocksConsumer consumer ) { this( column, - new PrecisionScore(eqIndex, columns), + scoreFunc, new MaxDropFinder<>( new GreaterThanConstraint(BigDecimal.ZERO), false, @@ -87,16 +83,14 @@ public CentristTrimmer( } public CentristTrimmer( - EQIndex eqIndex, - IdentifiableObjectSet columns, IdentifiableIDSet column, + BlockScoreFunction scoreFunc, SignatureBlocksConsumer consumer ) { this( - eqIndex, - columns, column, + scoreFunc, new ZeroThreshold(), consumer ); @@ -110,12 +104,6 @@ public CentristTrimmer( this( column, scoreFunc, - new MaxDropFinder<>( - new GreaterThanConstraint(BigDecimal.ZERO), - false, - false - ), - new ZeroThreshold(), null ); } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index ea626db..a147970 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -33,6 +33,7 @@ public class SignatureTrimmerFactory { private final IdentifiableObjectSet _columns; private final EQIndex _nodes; + private PrecisionScore _scoreFunc = null; private final String _trimmerSpec; public SignatureTrimmerFactory( @@ -77,14 +78,19 @@ public SignatureTrimmer getTrimmer( if (_trimmerSpec.equals(SignatureTrimmer.CONSERVATIVE)) { return new ConservativeTrimmer(column, consumer); } else if (_trimmerSpec.equals(SignatureTrimmer.CENTRIST)) { - return new CentristTrimmer(_nodes, _columns, column, consumer); + if (_scoreFunc == null) { + _scoreFunc = new PrecisionScore(_nodes, _columns); + } + return new CentristTrimmer(column, _scoreFunc, consumer); } else if 
(_trimmerSpec.startsWith(SignatureTrimmer.CENTRIST)) { int pos = _trimmerSpec.indexOf(":"); if (pos != -1) { + if (_scoreFunc == null) { + _scoreFunc = new PrecisionScore(_nodes, _columns); + } return new CentristTrimmer( - _nodes, - _columns, column, + _scoreFunc, Threshold.getConstraint(_trimmerSpec.substring(pos + 1)), consumer ); From 3dc60802a19025c3edd0bf03e1e4438b76c3915e Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Fri, 30 Oct 2020 18:07:14 -0400 Subject: [PATCH 10/25] Add best match finder for local domains --- pom.xml | 3 +- .../org/opendata/core/io/EntitySetReader.java | 2 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../d4/experiments/BestGTLocalMatch.java | 137 ++++++++++++++++++ .../curation/d4/experiments/BestGTMatch.java | 19 ++- .../d4/explore/ColumnLocalDomainPrinter.java | 122 ++++++++++++++++ 6 files changed, 278 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java create mode 100644 src/main/java/org/opendata/curation/d4/explore/ColumnLocalDomainPrinter.java diff --git a/pom.xml b/pom.xml index bce55e0..f62081a 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,8 @@ - org.opendata.curation.d4.experiments.BestGTMatch + org.opendata.curation.d4.experiments.BestGTLocalMatch + diff --git a/src/main/java/org/opendata/core/io/EntitySetReader.java b/src/main/java/org/opendata/core/io/EntitySetReader.java index ddae68b..b75b4b5 100644 --- a/src/main/java/org/opendata/core/io/EntitySetReader.java +++ b/src/main/java/org/opendata/core/io/EntitySetReader.java @@ -65,7 +65,7 @@ public void read( public void read(EntityConsumer consumer) throws java.io.IOException { - this.read(new AnyObjectFilter(), consumer); + this.read(new AnyObjectFilter<>(), consumer); } public EntitySet readEntities(ObjectFilter filter) throws java.io.IOException { diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 
84b540e..2898918 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev4"; + public static final String VERSION = "0.29.0.dev5"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java new file mode 100644 index 0000000..74e19eb --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java @@ -0,0 +1,137 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.experiments; + +import java.io.File; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.metric.F1; +import org.opendata.core.metric.Precision; +import org.opendata.core.metric.Recall; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; +import org.opendata.core.util.FormatedBigDecimal; +import org.opendata.curation.d4.domain.Domain; +import org.opendata.curation.d4.domain.DomainReader; +import org.opendata.db.eq.EQIndex; + +/** + * Best match for ground truth domains against all local domains. + * + * @author Heiko Mueller + */ +public class BestGTLocalMatch { + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger.getLogger(BestGTLocalMatch.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File gtDir = new File(args[1]); + File localDomainFile = new File(args[2]); + + System.out.println("DOMAIN\tPRECISION\tRECALL\tF1"); + + HashMap groundTruths = new HashMap<>(); + try { + for (File file : gtDir.listFiles()) { + String name = file.getName().substring(0, file.getName().indexOf(".")); + IDSet gt = new GTReader().read(file); + groundTruths.put(name, gt); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + + HashMap bestMatches = new HashMap<>(); + for (String key : groundTruths.keySet()) { + BigDecimal[] matchInfo = new BigDecimal[]{ + BigDecimal.ZERO, + BigDecimal.ZERO, + BigDecimal.ZERO + }; + bestMatches.put(key, matchInfo); + } + + try { + EQIndex eqIndex = new EQIndex(eqFile); + for (Domain domain : new DomainReader(localDomainFile).read()) { + HashIDSet terms 
= new HashIDSet(); + for (int nodeId : domain) { + terms.add(eqIndex.get(nodeId).terms()); + if (terms.length() > 100000) { + break; + } + } + if (terms.length() > 100000) { + continue; + } + for (String key : groundTruths.keySet()) { + IDSet gt = groundTruths.get(key); + int ovp = terms.overlap(gt); + if (ovp > 0) { + Precision precision = new Precision(ovp, terms.length()); + Recall recall = new Recall(ovp, gt.length()); + BigDecimal f1 = new F1(precision, recall).value(); + BigDecimal[] bestMatch = bestMatches.get(key); + if (bestMatch[2].compareTo(f1) < 0) { + bestMatch[0] = precision.value(); + bestMatch[1] = recall.value(); + bestMatch[2] = f1; + } + } + } + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + + List names = new ArrayList<>(groundTruths.keySet()); + Collections.sort(names); + for (String name : names) { + BigDecimal[] bestMatch = bestMatches.get(name); + System.out.println( + String.format( + "%s\t%s\t%s\t%s", + name, + new FormatedBigDecimal(bestMatch[0]), + new FormatedBigDecimal(bestMatch[1]), + new FormatedBigDecimal(bestMatch[2]) + ) + ); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java index 1b37587..31af472 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTMatch.java @@ -54,7 +54,7 @@ public static void main(String[] args) { File domainDir = new File(args[1]); boolean firstBlockOnly = Boolean.parseBoolean(args[2]); - System.out.println("DOMAIN\tPRECISION\tRECALL\tF1"); + System.out.println("DOMAIN\tID\tPRECISION\tRECALL\tF1"); try { List domains; @@ -62,7 +62,12 @@ public static void main(String[] args) { for (File file : gtDir.listFiles()) { String name = file.getName().substring(0, file.getName().indexOf(".")); IDSet gt = new GTReader().read(file); - BigDecimal[] bestMatch = new 
BigDecimal[]{BigDecimal.ZERO, BigDecimal.ZERO, BigDecimal.ZERO}; + BigDecimal[] bestMatch = new BigDecimal[]{ + BigDecimal.ZERO, + BigDecimal.ZERO, + BigDecimal.ZERO + }; + int bestMatchID = -1; for (MutableIdentifiableIDSet domain : domains) { int ovp = domain.overlap(gt); if (ovp > 0) { @@ -70,14 +75,20 @@ public static void main(String[] args) { Recall recall = new Recall(ovp, gt.length()); BigDecimal f1 = new F1(precision, recall).value(); if (bestMatch[2].compareTo(f1) < 0) { - bestMatch = new BigDecimal[]{precision.value(), recall.value(), f1}; + bestMatch = new BigDecimal[]{ + precision.value(), + recall.value(), + f1 + }; + bestMatchID = domain.id(); } } } System.out.println( String.format( - "%s\t%s\t%s\t%s", + "%s\t%d\t%s\t%s\t%s", name, + bestMatchID, new FormatedBigDecimal(bestMatch[0]), new FormatedBigDecimal(bestMatch[1]), new FormatedBigDecimal(bestMatch[2]) diff --git a/src/main/java/org/opendata/curation/d4/explore/ColumnLocalDomainPrinter.java b/src/main/java/org/opendata/curation/d4/explore/ColumnLocalDomainPrinter.java new file mode 100644 index 0000000..91982ae --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/explore/ColumnLocalDomainPrinter.java @@ -0,0 +1,122 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.explore; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.EntitySetReader; +import org.opendata.core.object.Entity; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.core.sort.NamedObjectComparator; +import org.opendata.curation.d4.domain.Domain; +import org.opendata.curation.d4.domain.DomainReader; +import org.opendata.db.eq.EQ; +import org.opendata.db.eq.EQIndex; + +/** + * Print local domains for a column. + * + * @author Heiko Mueller + */ +public class ColumnLocalDomainPrinter { + + public void run( + EQIndex eqIndex, + EntitySetReader termReader, + IdentifiableObjectSet localDomains, + int columnId + ) throws java.io.IOException { + + HashIDSet columnTerms = new HashIDSet(); + for (int nodeId : eqIndex.columns().get(columnId)) { + columnTerms.add(eqIndex.get(nodeId).terms()); + } + + List columnDomains = new ArrayList<>(); + HashIDSet domainTerms = new HashIDSet(); + for (Domain domain : localDomains) { + if (domain.columns().contains(columnId)) { + HashIDSet dom = new HashIDSet(); + for (int nodeId : domain) { + EQ node = eqIndex.get(nodeId); + dom.add(node.terms()); + domainTerms.add(node.terms()); + } + columnDomains.add(dom); + } + } + + List terms = termReader.readEntities(domainTerms).toList(); + Collections.sort(terms, new NamedObjectComparator()); + + for (Entity term : terms) { + String line = term.name(); + if (!columnTerms.contains(term.id())) { + line += " (*)"; + } + for (HashIDSet domain : columnDomains) { + if (domain.contains(term.id())) { + line += "\tX"; + } else { + line += "\t-"; + } + } + System.out.println(line); + } + } + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger + 
.getLogger(ColumnLocalDomainPrinter.class.getName()); + + public static void main(String[] args) { + + if (args.length != 4) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File termFile = new File(args[1]); + File localDomainFile = new File(args[2]); + int columnId = Integer.parseInt(args[3]); + + try { + new ColumnLocalDomainPrinter().run( + new EQIndex(eqFile), + new EntitySetReader(termFile), + new DomainReader(localDomainFile).read(), + columnId + ); + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} From 4e2881991c025d5ca477621f0a8a0abda0fd1de4 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Tue, 3 Nov 2020 22:04:16 -0500 Subject: [PATCH 11/25] Add domain id and top-k option for best local domain match finder --- .../d4/experiments/BestGTLocalMatch.java | 143 ++++++++++-------- .../curation/d4/experiments/BestMatch.java | 69 +++++++++ 2 files changed, 152 insertions(+), 60 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestMatch.java diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java index 74e19eb..8c4969f 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTLocalMatch.java @@ -18,19 +18,17 @@ package org.opendata.curation.d4.experiments; import java.io.File; -import java.math.BigDecimal; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; -import org.opendata.core.metric.F1; import org.opendata.core.metric.Precision; import org.opendata.core.metric.Recall; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IDSet; -import org.opendata.core.util.FormatedBigDecimal; +import 
org.opendata.core.set.IdentifiableObjectSet; import org.opendata.curation.d4.domain.Domain; import org.opendata.curation.d4.domain.DomainReader; import org.opendata.db.eq.EQIndex; @@ -42,17 +40,88 @@ */ public class BestGTLocalMatch { + public void run( + EQIndex eqIndex, + HashMap groundTruths, + IdentifiableObjectSet domains, + int k + ) { + + HashMap bestMatches = new HashMap<>(); + for (String key : groundTruths.keySet()) { + bestMatches.put(key, new BestMatch[k]); + } + + for (Domain domain : domains) { + HashIDSet terms = new HashIDSet(); + for (int nodeId : domain) { + terms.add(eqIndex.get(nodeId).terms()); + if (terms.length() > 100000) { + break; + } + } + if (terms.length() > 100000) { + continue; + } + for (String key : groundTruths.keySet()) { + IDSet gt = groundTruths.get(key); + int ovp = terms.overlap(gt); + if (ovp > 0) { + Precision precision = new Precision(ovp, terms.length()); + Recall recall = new Recall(ovp, gt.length()); + BestMatch match = new BestMatch(domain.id(), precision, recall); + BestMatch[] gtMatches = bestMatches.get(key); + for (int iMatch = 0; iMatch < gtMatches.length; iMatch++) { + if (gtMatches[iMatch] == null) { + gtMatches[iMatch] = match; + break; + } else if (gtMatches[iMatch].f1().compareTo(match.f1()) < 0) { + for (int jMatch = gtMatches.length - 2; jMatch >= iMatch; jMatch--) { + gtMatches[jMatch + 1] = gtMatches[jMatch]; + } + gtMatches[iMatch] = match; + break; + } + } + } + } + } + + List names = new ArrayList<>(groundTruths.keySet()); + Collections.sort(names); + for (String name : names) { + BestMatch[] gtMatches = bestMatches.get(name); + for (BestMatch match : gtMatches) { + if (match == null) { + break; + } + System.out.println( + String.format( + "%s\t%d\t%s\t%s\t%s", + name, + match.domainId(), + match.precision().toString(), + match.recall().toString(), + match.f1().toString() + ) + ); + } + } + } + private static final String COMMAND = "Usage:\n" + " \n" + " \n" + - " "; + " \n" + + " "; - private static 
final Logger LOGGER = Logger.getLogger(BestGTLocalMatch.class.getName()); + private static final Logger LOGGER = Logger + .getLogger(BestGTLocalMatch.class.getName()); public static void main(String[] args) { - if (args.length != 3) { + if (args.length != 4) { System.out.println(COMMAND); System.exit(-1); } @@ -60,8 +129,9 @@ public static void main(String[] args) { File eqFile = new File(args[0]); File gtDir = new File(args[1]); File localDomainFile = new File(args[2]); + int k = Integer.parseInt(args[3]); - System.out.println("DOMAIN\tPRECISION\tRECALL\tF1"); + System.out.println("GT\tID\tPRECISION\tRECALL\tF1"); HashMap groundTruths = new HashMap<>(); try { @@ -75,63 +145,16 @@ public static void main(String[] args) { System.exit(-1); } - HashMap bestMatches = new HashMap<>(); - for (String key : groundTruths.keySet()) { - BigDecimal[] matchInfo = new BigDecimal[]{ - BigDecimal.ZERO, - BigDecimal.ZERO, - BigDecimal.ZERO - }; - bestMatches.put(key, matchInfo); - } - try { - EQIndex eqIndex = new EQIndex(eqFile); - for (Domain domain : new DomainReader(localDomainFile).read()) { - HashIDSet terms = new HashIDSet(); - for (int nodeId : domain) { - terms.add(eqIndex.get(nodeId).terms()); - if (terms.length() > 100000) { - break; - } - } - if (terms.length() > 100000) { - continue; - } - for (String key : groundTruths.keySet()) { - IDSet gt = groundTruths.get(key); - int ovp = terms.overlap(gt); - if (ovp > 0) { - Precision precision = new Precision(ovp, terms.length()); - Recall recall = new Recall(ovp, gt.length()); - BigDecimal f1 = new F1(precision, recall).value(); - BigDecimal[] bestMatch = bestMatches.get(key); - if (bestMatch[2].compareTo(f1) < 0) { - bestMatch[0] = precision.value(); - bestMatch[1] = recall.value(); - bestMatch[2] = f1; - } - } - } - } + new BestGTLocalMatch().run( + new EQIndex(eqFile), + groundTruths, + new DomainReader(localDomainFile).read(), + k + ); } catch (java.io.IOException ex) { LOGGER.log(Level.SEVERE, "RUN", ex); 
System.exit(-1); } - - List names = new ArrayList<>(groundTruths.keySet()); - Collections.sort(names); - for (String name : names) { - BigDecimal[] bestMatch = bestMatches.get(name); - System.out.println( - String.format( - "%s\t%s\t%s\t%s", - name, - new FormatedBigDecimal(bestMatch[0]), - new FormatedBigDecimal(bestMatch[1]), - new FormatedBigDecimal(bestMatch[2]) - ) - ); - } } } diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestMatch.java new file mode 100644 index 0000000..7610936 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/BestMatch.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import org.opendata.core.metric.F1; +import org.opendata.core.metric.Precision; +import org.opendata.core.metric.Recall; + +/** + * Information about a best match with a ground truth domain. 
+ * + * @author heiko + */ +public class BestMatch { + + private final int _domainId; + private final F1 _f1; + private final Precision _precision; + private final Recall _recall; + + public BestMatch(int domainId, Precision precision, Recall recall) { + + _domainId = domainId; + _precision = precision; + _recall = recall; + _f1 = new F1(precision, recall); +; + } + + public BestMatch() { + + this(-1, new Precision(), new Recall()); + } + + public int domainId() { + + return _domainId; + } + + public F1 f1() { + + return _f1; + } + + public Precision precision() { + + return _precision; + } + + public Recall recall() { + + return _recall; + } +} From c4644fa8041d9397c3bfaf47c9d0e2453593865d Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Fri, 27 Nov 2020 21:38:36 -0500 Subject: [PATCH 12/25] Add multi-pass local domain generator --- pom.xml | 8 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 37 ++-- .../domain/MultiScanLocalDomainGenerator.java | 179 +++++++++++++++++ ...va => SingleScanLocalDomainGenerator.java} | 12 +- .../d4/experiments/BestGTAllMatch.java | 104 ++++++++++ .../curation/d4/experiments/GTReader.java | 3 +- .../SignatureComputationExperiment.java | 96 +++++++++ .../experiments/StrongDomainJsonReader.java | 38 ++++ .../CosineSimContextSignaturePrinter.java | 190 ++++++++++++++++++ .../d4/explore/GTDisjointTermsPrinter.java | 100 +++++++++ .../d4/signature/ContextSignaturePrinter.java | 90 +-------- .../signature/SignatureBlocksGenerator.java | 56 +++++- .../SignatureBlocksIndexFactory.java | 58 ------ .../d4/signature/SignatureBlocksStats.java | 54 +++-- .../SignatureBlocksWriterFactory.java | 103 ---------- .../org/opendata/db/eq/DefaultEQFactory.java | 33 +++ .../java/org/opendata/db/eq/EQFactory.java | 29 +++ src/main/java/org/opendata/db/eq/EQImpl.java | 2 +- src/main/java/org/opendata/db/eq/EQIndex.java | 5 + .../java/org/opendata/db/eq/EQReader.java | 11 +- 
.../java/org/opendata/db/eq/LazyParseEQ.java | 69 +++++++ .../opendata/db/eq/LazyParseEQFactory.java | 34 ++++ .../eq/LazyParseEQIndex.java} | 16 +- .../db/eq/SimilarTermIndexGenerator.java | 153 ++++++++++++++ 25 files changed, 1171 insertions(+), 311 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java rename src/main/java/org/opendata/curation/d4/domain/{ParallelLocalDomainGenerator.java => SingleScanLocalDomainGenerator.java} (93%) create mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java create mode 100644 src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java create mode 100644 src/main/java/org/opendata/curation/d4/explore/CosineSimContextSignaturePrinter.java create mode 100644 src/main/java/org/opendata/curation/d4/explore/GTDisjointTermsPrinter.java delete mode 100644 src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java delete mode 100644 src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java create mode 100644 src/main/java/org/opendata/db/eq/DefaultEQFactory.java create mode 100644 src/main/java/org/opendata/db/eq/EQFactory.java create mode 100644 src/main/java/org/opendata/db/eq/LazyParseEQ.java create mode 100644 src/main/java/org/opendata/db/eq/LazyParseEQFactory.java rename src/main/java/org/opendata/{curation/d4/signature/SignatureBlocksConsumerFactory.java => db/eq/LazyParseEQIndex.java} (72%) create mode 100644 src/main/java/org/opendata/db/eq/SimilarTermIndexGenerator.java diff --git a/pom.xml b/pom.xml index f62081a..6aea5eb 100644 --- a/pom.xml +++ b/pom.xml @@ -70,12 +70,14 @@ true - + org.opendata.curation.d4.D4 + - org.opendata.curation.d4.experiments.BestGTLocalMatch - + + + diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 2898918..ecf8907 100644 --- 
a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev5"; + public static final String VERSION = "0.29.0.dev7"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 0dc06e1..f00e2d4 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -34,7 +34,7 @@ import org.opendata.curation.d4.domain.DomainReader; import org.opendata.curation.d4.domain.DomainSetStatsPrinter; import org.opendata.curation.d4.domain.DomainWriter; -import org.opendata.curation.d4.domain.ParallelLocalDomainGenerator; +import org.opendata.curation.d4.domain.SingleScanLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomainGenerator; import org.opendata.curation.d4.signature.SignatureBlocksGenerator; import org.opendata.curation.d4.signature.SignatureBlocksStats; @@ -43,6 +43,7 @@ import org.opendata.core.io.FileSystem; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.curation.d4.domain.Domain; +import org.opendata.curation.d4.domain.MultiScanLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomain; import org.opendata.curation.d4.domain.StrongDomainReader; import org.opendata.curation.d4.domain.StrongDomainWriter; @@ -50,7 +51,7 @@ import org.opendata.curation.d4.export.PrimaryDomainWriter; import org.opendata.curation.d4.signature.SignatureBlocksReader; import org.opendata.curation.d4.signature.SignatureBlocksStream; -import org.opendata.curation.d4.signature.SignatureBlocksWriterFactory; +import org.opendata.curation.d4.signature.SignatureBlocksWriter; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import 
org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.db.column.Column; @@ -124,9 +125,10 @@ public void exportStrongDomains( public void localDomains( EQIndex nodeIndex, File columnsFile, - SignatureBlocksStream signatures, + SignatureBlocksReader signatures, String trimmer, int threads, + boolean singleScan, boolean verbose, TelemetryCollector telemetry, File outputFile @@ -134,15 +136,27 @@ public void localDomains( ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(); new ExpandedColumnReader(columnsFile).stream(columnIndex); - new ParallelLocalDomainGenerator(telemetry).run( + if (singleScan) { + new SingleScanLocalDomainGenerator(telemetry).run( + nodeIndex, + columnIndex, + signatures, + trimmer, + threads, + verbose, + new DomainWriter(outputFile) + ); + } else { + new MultiScanLocalDomainGenerator(telemetry).run( nodeIndex, columnIndex, - signatures, + signatures.read(), trimmer, threads, verbose, new DomainWriter(outputFile) ); + } if (verbose) { DomainSetStatsPrinter localStats = new DomainSetStatsPrinter(); @@ -162,12 +176,7 @@ public void signatures( File outputFile ) throws java.lang.InterruptedException, java.io.IOException { - SignatureBlocksWriterFactory sigWriter; - sigWriter = new SignatureBlocksWriterFactory( - outputFile, - new SignatureTrimmerFactory(nodeIndex, nodeIndex.columns(), trimmerSpec), - false - ); + SignatureBlocksWriter sigWriter = new SignatureBlocksWriter(outputFile); new SignatureBlocksGenerator(telemetry).runWithMaxDrop( nodeIndex, new ConcurrentLinkedQueue<>(nodeIndex.keys().toList()), @@ -175,7 +184,8 @@ public void signatures( ignoreLastDrop, threads, verbose, - sigWriter + new SignatureTrimmerFactory(nodeIndex, nodeIndex.columns(), trimmerSpec) + .getTrimmer(sigWriter) ); if (verbose) { @@ -470,6 +480,7 @@ public static void main(String[] args) { " [default: " + SignatureTrimmer.CENTRIST + "]" ), new Parameter("threads", " [default: 6]"), + new Parameter("singlescan", " [default: 
false]"), new Parameter("verbose", " [default: true]"), new Parameter( "localdomains", @@ -483,6 +494,7 @@ public static void main(String[] args) { File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); String trimmer = params.getAsString("trimmer", SignatureTrimmer.CENTRIST); int threads = params.getAsInt("threads", 6); + boolean singleScan = params.getAsBool("singlescan", false); boolean verbose = params.getAsBool("verbose", true); File localDomainFile = params.getAsFile("localdomains", "local-domains.txt.gz"); try { @@ -492,6 +504,7 @@ public static void main(String[] args) { new SignatureBlocksReader(signatureFile), trimmer, threads, + singleScan, verbose, new TelemetryPrinter(), localDomainFile diff --git a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java new file mode 100644 index 0000000..99a63a1 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java @@ -0,0 +1,179 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.domain; + +import java.util.Date; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.opendata.curation.d4.telemetry.TelemetryCollector; +import org.opendata.curation.d4.telemetry.TelemetryPrinter; +import org.opendata.curation.d4.column.ExpandedColumn; +import org.opendata.curation.d4.column.ExpandedColumnIndex; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.SignatureBlocksStream; +import org.opendata.curation.d4.signature.trim.SignatureTrimmer; +import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; +import org.opendata.core.set.MutableIdentifiableIDSet; +import org.opendata.core.util.MemUsagePrinter; +import org.opendata.db.eq.EQIndex; + +/** + * Generator for local domains using undirected graphs. Each connected component + * in the graph generated from the robust signatures of the column elements + * represents a local domain. + * + * The multi scan local domain generator reads the set of signature blocks into + * main memory and scans through the set for each column. 
+ * + * @author Heiko Mueller + */ +public class MultiScanLocalDomainGenerator { + + public static final String TELEMETRY_ID = "LOCAL DOMAINS"; + + private class DomainGeneratorTask implements Runnable { + + private final ConcurrentLinkedQueue _columns; + private final UniqueDomainSet _domains; + private final int _id; + private final EQIndex _nodes; + private final SignatureBlocksStream _signatures; + private final SignatureTrimmerFactory _trimmerFactory; + private final boolean _verbose; + + public DomainGeneratorTask( + int id, + EQIndex nodes, + ConcurrentLinkedQueue columns, + SignatureBlocksStream signatures, + SignatureTrimmerFactory trimmerFactory, + UniqueDomainSet domains, + boolean verbose + ) { + _id = id; + _nodes = nodes; + _columns = columns; + _signatures = signatures; + _trimmerFactory = trimmerFactory; + _domains = domains; + _verbose = verbose; + } + + @Override + public void run() { + + Date start = new Date(); + + ExpandedColumn column; + while ((column = _columns.poll()) != null) { + MutableIdentifiableIDSet col; + col = new MutableIdentifiableIDSet(column.id(), column.nodes()); + SignatureBlocksConsumer domainGenerator; + domainGenerator = new UndirectedDomainGenerator( + column, + _domains, + _nodes.nodeSizes() + ); + SignatureTrimmer trimmer; + trimmer = _trimmerFactory.getTrimmer(col, domainGenerator); + try { + _signatures.stream(trimmer); + } catch (java.io.IOException ex) { + throw new RuntimeException(ex); + } + } + + Date end = new Date(); + + long execTime = end.getTime() - start.getTime(); + + if (_verbose) { + System.out.println(_id + " DONE WITH " + _columns.size() + " COLUMNS IN " + execTime + " ms"); + } + } + } + + private final TelemetryCollector _telemetry; + + public MultiScanLocalDomainGenerator(TelemetryCollector telemetry) { + + _telemetry = telemetry; + } + + public MultiScanLocalDomainGenerator() { + + this(new TelemetryPrinter()); + } + + public void run( + EQIndex nodes, + ExpandedColumnIndex columnIndex, + 
SignatureBlocksStream signatures, + String trimmer, + int threads, + boolean verbose, + DomainConsumer consumer + ) throws java.io.IOException { + + UniqueDomainSet domains = new UniqueDomainSet(columnIndex); + + Date start = new Date(); + if (verbose) { + System.out.println("START @ " + start); + new MemUsagePrinter().print(); + } + + ExecutorService es = Executors.newCachedThreadPool(); + + ConcurrentLinkedQueue queue; + queue = new ConcurrentLinkedQueue<>(columnIndex.columns()); + + SignatureTrimmerFactory trimmerFactory; + trimmerFactory = new SignatureTrimmerFactory(nodes, columnIndex.toColumns(), trimmer); + + for (int iThread = 0; iThread < threads; iThread++) { + DomainGeneratorTask task = new DomainGeneratorTask( + iThread, + nodes, + queue, + signatures, + trimmerFactory, + domains, + verbose + ); + es.execute(task); + } + es.shutdown(); + try { + es.awaitTermination(threads, TimeUnit.DAYS); + } catch (java.lang.InterruptedException ex) { + throw new RuntimeException(ex); + } + + domains.stream(consumer); + + Date end = new Date(); + if (verbose) { + System.out.println("END @ " + end); + long execTime = end.getTime() - start.getTime(); + _telemetry.add(TELEMETRY_ID, execTime); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java similarity index 93% rename from src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java rename to src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java index 5e6d096..3711b87 100644 --- a/src/main/java/org/opendata/curation/d4/domain/ParallelLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java @@ -32,7 +32,6 @@ import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import 
org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; -import org.opendata.core.set.IDSet; import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; import org.opendata.db.eq.EQIndex; @@ -42,11 +41,14 @@ * in the graph generated from the robust signatures of the column elements * represents a local domain. * - * Uses a concurrent queue to distribute columns across workers. Relies + * The single scan local domain generator scans through the set of signature + * blocks exactly once (per thread) while generating the local domains. Requires + * to have domain generators for all columns in memory (instead of having a + * copy of all signature blocks in memory). * * @author Heiko Mueller */ -public class ParallelLocalDomainGenerator { +public class SingleScanLocalDomainGenerator { public static final String TELEMETRY_ID = "LOCAL DOMAINS"; @@ -118,12 +120,12 @@ public void run() { private final TelemetryCollector _telemetry; - public ParallelLocalDomainGenerator(TelemetryCollector telemetry) { + public SingleScanLocalDomainGenerator(TelemetryCollector telemetry) { _telemetry = telemetry; } - public ParallelLocalDomainGenerator() { + public SingleScanLocalDomainGenerator() { this(new TelemetryPrinter()); } diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java new file mode 100644 index 0000000..20a2b05 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java @@ -0,0 +1,104 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import java.io.File; +import java.math.BigDecimal; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.metric.F1; +import org.opendata.core.metric.Precision; +import org.opendata.core.metric.Recall; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; +import org.opendata.core.util.FormatedBigDecimal; + +/** + * Find best match of discovered strong domains with ground-truth domains over + * all strong domain block sub-sequences. 
+ * + * @author Heiko Mueller + */ +public class BestGTAllMatch { + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger.getLogger(BestGTAllMatch.class.getName()); + + public static void main(String[] args) { + + if (args.length != 2) { + System.out.println(COMMAND); + System.exit(-1); + } + + File gtDir = new File(args[0]); + File domainDir = new File(args[1]); + + System.out.println("DOMAIN\tID\tPRECISION\tRECALL\tF1"); + + try { + List> domains; + domains = new StrongDomainJsonReader().readAllBlocks(domainDir); + for (File file : gtDir.listFiles()) { + String name = file.getName().substring(0, file.getName().indexOf(".")); + IDSet gt = new GTReader().read(file); + BigDecimal[] bestMatch = new BigDecimal[]{ + BigDecimal.ZERO, + BigDecimal.ZERO, + BigDecimal.ZERO + }; + for (List domainBlocks : domains) { + HashIDSet domain = new HashIDSet(); + for (IDSet block : domainBlocks) { + domain.add(block); + int ovp = domain.overlap(gt); + if (ovp > 0) { + Precision precision = new Precision(ovp, domain.length()); + Recall recall = new Recall(ovp, gt.length()); + BigDecimal f1 = new F1(precision, recall).value(); + if (bestMatch[2].compareTo(f1) < 0) { + bestMatch = new BigDecimal[]{ + precision.value(), + recall.value(), + f1 + }; + } + } + } + } + System.out.println( + String.format( + "%s\t%s\t%s\t%s", + name, + new FormatedBigDecimal(bestMatch[0]), + new FormatedBigDecimal(bestMatch[1]), + new FormatedBigDecimal(bestMatch[2]) + ) + ); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/experiments/GTReader.java b/src/main/java/org/opendata/curation/d4/experiments/GTReader.java index ebb59fb..81c4792 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/GTReader.java +++ b/src/main/java/org/opendata/curation/d4/experiments/GTReader.java @@ -24,7 +24,8 @@ import 
org.opendata.core.set.IDSet; /** - * + * Read set of term identifier for all terms in a ground-truth domain. + * * @author Heiko Mueller */ public class GTReader { diff --git a/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java b/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java new file mode 100644 index 0000000..be6d329 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java @@ -0,0 +1,96 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.experiments; + +import java.io.File; +import java.io.PrintWriter; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; +import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.SignatureBlocksGenerator; +import org.opendata.curation.d4.signature.SignatureBlocksIndex; +import org.opendata.curation.d4.signature.trim.LiberalTrimmer; +import org.opendata.curation.d4.telemetry.TelemetrySet; +import org.opendata.db.column.Column; +import org.opendata.db.eq.EQIndex; + +/** + * Print execution time and signature sizes for context signature blocks + * generation for all columns in a database. 
+ * + * @author Heiko Mueller + */ +public class SignatureComputationExperiment { + + private final static String COMMAND = + "Usage: "; + + private final static Logger LOGGER = Logger + .getLogger(SignatureComputationExperiment.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + int threads = Integer.parseInt(args[1]); + File outFile = new File(args[2]); + + try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { + EQIndex eqIndex = new EQIndex(eqFile); + for (Column column : eqIndex.columns()) { + TelemetrySet telemetry = new TelemetrySet(); + SignatureBlocksIndex consumer = new SignatureBlocksIndex(); + new SignatureBlocksGenerator(telemetry) + .runWithMaxDrop( + eqIndex, + new ConcurrentLinkedQueue<>(column.toList()), + threads, + true, + new LiberalTrimmer(eqIndex.nodeSizes(), consumer) + ); + long execTime = telemetry.get(SignatureBlocksGenerator.TELEMETRY_ID); + int blockCount = 0; + int nodeCount = 0; + for (SignatureBlocks sig : consumer) { + blockCount += sig.size(); + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + nodeCount += sig.get(iBlock).length; + } + } + String line = String.format( + "%d\t%d\t%d\t%d\t%d", + column.id(), + execTime, + consumer.length(), + blockCount, + nodeCount + ); + out.println(line); + System.out.println(line); + } + } catch (java.lang.InterruptedException | java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java b/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java index 5625039..fe71d71 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java +++ b/src/main/java/org/opendata/curation/d4/experiments/StrongDomainJsonReader.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.List; import 
org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; import org.opendata.core.set.MutableIdentifiableIDSet; /** @@ -60,6 +61,30 @@ public MutableIdentifiableIDSet read( } return new MutableIdentifiableIDSet(domainId, terms); } + + public List<IDSet> readBlocks(File file) throws java.io.IOException { + + JsonArray blocks; + try (FileReader reader = new FileReader(file)) { /* close the file handle as soon as the JSON has been parsed */ + blocks = new JsonParser().parse(reader) + .getAsJsonObject() + .get("terms") + .getAsJsonArray(); + } + + int domainId = Integer.parseInt(file.getName().substring(0, file.getName().indexOf("."))); /* NOTE(review): value is unused; it only rejects non-numeric file names - confirm intent */ + List<IDSet> terms = new ArrayList<>(); + + for (int iBlock = 0; iBlock < blocks.size(); iBlock++) { + JsonArray block = blocks.get(iBlock).getAsJsonArray(); + HashIDSet blockTerms = new HashIDSet(); + for (int iTerm = 0; iTerm < block.size(); iTerm++) { + int termId = block.get(iTerm).getAsJsonObject().get("id").getAsInt(); + blockTerms.add(termId); + } + terms.add(blockTerms); + } + return terms; + } public List readAll( File inputDir, @@ -76,4 +101,17 @@ public List readAll( return domains; } + + public List<List<IDSet>> readAllBlocks(File inputDir) throws java.io.IOException { + + List<List<IDSet>> domains = new ArrayList<>(); /* one list of term blocks per *.json file in inputDir */ + + for (File file : inputDir.listFiles()) { + if (file.getName().endsWith(".json")) { + domains.add(this.readBlocks(file)); + } + } + + return domains; + } } diff --git a/src/main/java/org/opendata/curation/d4/explore/CosineSimContextSignaturePrinter.java b/src/main/java/org/opendata/curation/d4/explore/CosineSimContextSignaturePrinter.java new file mode 100644 index 0000000..2a4bfb8 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/explore/CosineSimContextSignaturePrinter.java @@ -0,0 +1,190 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.explore; + +import org.opendata.curation.d4.signature.*; +import java.io.File; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.curation.d4.Arguments; +import org.opendata.curation.d4.Constants; +import org.opendata.core.constraint.GreaterThanConstraint; +import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.core.sort.DoubleValueDescSort; +import org.opendata.db.eq.EQIndex; +import org.opendata.db.term.Term; +import org.opendata.db.term.TermIndexReader; + +/** + * Print context signature for node. 
+ * + * @author Heiko Mueller + */ +public class CosineSimContextSignaturePrinter { + + private static double cosineSimilarity(double[] v1, double[] v2) { /* dot(v1, v2) / (|v1| * |v2|); iterates over v1.length entries */ + + double sum = 0; + double a = 0; + double b = 0; + for (int i = 0; i < v1.length; i++) { + sum += (v1[i] * v2[i]); + a += (v1[i] * v1[i]); + b += (v2[i] * v2[i]); + } + return sum / (Math.sqrt(a) * Math.sqrt(b)); + } + + public void print( + EQIndex eqIndex, + TermIndexReader termReader, + boolean fullSignatureConstraint, + boolean ignoreLastDrop, + int nodeId + ) throws java.io.IOException { + + MaxDropFinder<SignatureValue> candidateFinder; + candidateFinder = new MaxDropFinder<>( + new GreaterThanConstraint(BigDecimal.ZERO), + fullSignatureConstraint, + ignoreLastDrop + ); + + ContextSignatureGenerator sigGen; + sigGen = new ContextSignatureGenerator(eqIndex.nodes()); + List<SignatureValue> sigElements = sigGen.getSignature(nodeId).elements(); + + double[] contextSig = new double[eqIndex.length()]; + for (SignatureValue val : sigElements) { + contextSig[val.id()] = val.value(); + } + + List<SignatureValue> sig = new ArrayList<>(); + for (SignatureValue val : sigElements) { + double[] elSig = new double[eqIndex.length()]; + for (SignatureValue v : sigGen.getSignature(val.id()).elements()) { + elSig[v.id()] = v.value(); /* index by the id of each element of val's signature, mirroring how contextSig is filled above */ + } + sig.add(new SignatureValue(val.id(), cosineSimilarity(contextSig, elSig))); + } + Collections.sort(sig, new DoubleValueDescSort<>()); + + HashIDSet nodeFilter = new HashIDSet(); + for (SignatureValue el : sig) { + nodeFilter.add(eqIndex.get(el.id()).terms()); + } + IdentifiableObjectSet<Term> termIndex = termReader.read(nodeFilter); + + int start = 0; + final int end = sig.size(); + int blockCount = 0; + while (start < end) { + int pruneIndex = candidateFinder.getPruneIndex(sig, start); + if (pruneIndex <= start) { + break; + } + blockCount++; + int nodeCount = pruneIndex - start; + int[] block = new int[nodeCount]; + int termCount = 0; + for (int iEl = start; iEl < pruneIndex; iEl++) { + SignatureValue el = sig.get(iEl); + block[iEl -
start] = el.id(); + termCount += eqIndex.get(el.id()).terms().length(); + } + Arrays.sort(block); + String headline = "\n-- BLOCK " + blockCount + " (" + nodeCount + " NODES, " + termCount + " TERMS)"; + System.out.println(headline); + for (int iEl = start; iEl < pruneIndex; iEl++) { + SignatureValue el = sig.get(iEl); + boolean isFirst = true; + for (int termId : eqIndex.get(el.id()).terms()) { + String line; + if (isFirst) { + line = Integer.toString(el.id()); + isFirst = false; + } else { + line = ""; + } + line += "\t" + termIndex.get(termId).name() + "\t" + el.toPlainString(); + System.out.println(line); + } + } + start = pruneIndex; + } + } + + private static final String ARG_FULLSIG = "fullSigConstraint"; + private static final String ARG_LASTDROP = "ignoreLastDrop"; + + private static final String[] ARGS = { + ARG_FULLSIG, + ARG_LASTDROP + }; + + private static final String COMMAND = + "Usage\n" + + " --" + ARG_FULLSIG + "=[true | false] [default: false]\n" + + " --" + ARG_LASTDROP + "=[true | false] [default: true]\n" + + " \n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger + .getLogger(CosineSimContextSignaturePrinter.class.getName()); + + public static void main(String[] args) { + + System.out.println(Constants.NAME + " - Context Signature Printer - Version (" + Constants.VERSION + ")\n"); + + if (args.length < 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + Arguments params = new Arguments(ARGS, args, 3); + File eqFile = new File(params.fixedArg(0)); + File termFile = new File(params.fixedArg(1)); + int nodeId = Integer.parseInt(params.fixedArg(2)); + + boolean fullSignatureConstraint = params.getAsBool(ARG_FULLSIG, false); + boolean ignoreLastDrop = params.getAsBool(ARG_LASTDROP, true); + + try { + // Read the node index + EQIndex nodeIndex = new EQIndex(eqFile); + new CosineSimContextSignaturePrinter().print( + nodeIndex, + new TermIndexReader(termFile), + fullSignatureConstraint, + ignoreLastDrop, + nodeId + ); + } 
catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/explore/GTDisjointTermsPrinter.java b/src/main/java/org/opendata/curation/d4/explore/GTDisjointTermsPrinter.java new file mode 100644 index 0000000..7ec0bdd --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/explore/GTDisjointTermsPrinter.java @@ -0,0 +1,100 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.explore; + +import java.io.BufferedReader; +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.curation.d4.column.ExpandedColumn; +import org.opendata.curation.d4.column.ExpandedColumnReader; +import org.opendata.db.eq.EQ; +import org.opendata.db.eq.EQIndex; +import org.opendata.db.eq.LazyParseEQIndex; + +/** + * Print pairs of terms in a ground-truth domain that do not occur together + * in any column (but are added to a common column in expansion). 
+ * + * @author Heiko Mueller + */ +public class GTDisjointTermsPrinter { + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println("Usage: "); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File columnsFile = new File(args[1]); + File gtFile = new File(args[2]); + + try (BufferedReader in = FileSystem.openReader(gtFile)) { + EQIndex eqIndex = new LazyParseEQIndex(eqFile); + IdentifiableObjectSet columns; + columns = new ExpandedColumnReader(columnsFile).read(); + List nodes = new ArrayList<>(); + String line; + while ((line = in.readLine()) != null) { + String[] tokens = line.split("\t"); + nodes.add(eqIndex.get(Integer.parseInt(tokens[0]))); + } + for (int iNode = 0; iNode < nodes.size() - 1; iNode++) { + EQ nodeI = nodes.get(iNode); + for (int jNode = iNode + 1; jNode < nodes.size(); jNode++) { + EQ nodeJ = nodes.get(jNode); + if (!nodeI.columns().overlaps(nodeJ.columns())) { + for (int columnId : nodeI.columns()) { + if (columns.get(columnId).expandedNodes().contains(nodeJ.id())) { + System.out.println( + String.format( + "%d -> %d in %d", + nodeI.id(), + nodeJ.id(), + columnId + ) + ); + } + } + for (int columnId : nodeJ.columns()) { + if (columns.get(columnId).expandedNodes().contains(nodeI.id())) { + System.out.println( + String.format( + "%d -> %d in %d", + nodeJ.id(), + nodeI.id(), + columnId + ) + ); + } + } + } + } + } + } catch (java.io.IOException ex) { + Logger.getGlobal().log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java index b4b2c8d..1f0488d 100644 --- a/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java +++ b/src/main/java/org/opendata/curation/d4/signature/ContextSignaturePrinter.java @@ -19,29 +19,22 @@ import java.io.File; import java.math.BigDecimal; -import java.util.ArrayList; import 
java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.opendata.curation.d4.Arguments; import org.opendata.curation.d4.Constants; -import org.opendata.curation.d4.signature.trim.CentristTrimmer; -import org.opendata.curation.d4.signature.trim.LiberalTrimmer; -import org.opendata.curation.d4.signature.trim.PrecisionScore; import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.core.object.IdentifiableDouble; import org.opendata.core.prune.MaxDropFinder; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; import org.opendata.db.term.Term; import org.opendata.db.term.TermIndexReader; /** - * Print context signature for node. Allows to include scores of blocks for a - * given column. + * Print context signature for node. * * @author Heiko Mueller */ @@ -52,7 +45,6 @@ public void print( TermIndexReader termReader, boolean fullSignatureConstraint, boolean ignoreLastDrop, - Column column, int nodeId ) throws java.io.IOException { @@ -72,42 +64,11 @@ public void print( for (SignatureValue el : sig) { nodeFilter.add(eqIndex.get(el.id()).terms()); } - if (column != null) { - for (int n : column) { - nodeFilter.add(eqIndex.get(n).terms()); - } - } IdentifiableObjectSet termIndex = termReader.read(nodeFilter); - if (column != null) { - System.out.println("COLUMN"); - for (int n : column) { - for (int termId : eqIndex.get(n).terms()) { - String value = termIndex.get(termId).name(); - System.out.println(n + "\t" + termId + "\t" + value); - } - } - System.out.println(); - } - - int[] columnNodes = null; - int columnSize = -1; - int[] nodeSizes = null; - List scores = null; - if (column != null) { - columnNodes = column.toArray(); - nodeSizes = eqIndex.nodeSizes(); - for (int n : column) { - columnSize += eqIndex.get(n).terms().length(); - } - scores = new ArrayList<>(); - } - - 
PrecisionScore scoreFunc = new PrecisionScore(eqIndex); int start = 0; final int end = sig.size(); int blockCount = 0; - ArrayList blocks = new ArrayList<>(); while (start < end) { int pruneIndex = candidateFinder.getPruneIndex(sig, start); if (pruneIndex <= start) { @@ -122,16 +83,8 @@ public void print( block[iEl - start] = el.id(); termCount += eqIndex.get(el.id()).terms().length(); } - blocks.add(block); Arrays.sort(block); - if (column != null ) { - BigDecimal score = scoreFunc.score(block, column.id()); - scores.add(new IdentifiableDouble(blockCount, score)); - } String headline = "\n-- BLOCK " + blockCount + " (" + nodeCount + " NODES, " + termCount + " TERMS)"; - if (column != null) { - headline += " SCORE " + scores.get(scores.size() - 1).toPlainString() + "\n"; - } System.out.println(headline); for (int iEl = start; iEl < pruneIndex; iEl++) { SignatureValue el = sig.get(iEl); @@ -149,42 +102,13 @@ public void print( } } start = pruneIndex; - } - - if (column != null) { - RobustSignatureIndex buffer = new RobustSignatureIndex(); - new LiberalTrimmer( - nodeSizes, - new CentristTrimmer(column, new PrecisionScore(eqIndex, eqIndex.columns()), buffer) - ).consume(new SignatureBlocksImpl(nodeId, BigDecimal.ONE, blocks)); - System.out.println("\nSIGNATURE BLOCKS FOR COLUMN " + column.id() + "\n"); - SignatureBlocks sigBlocks = buffer.get(nodeId); - for (int iBlock = 0; iBlock < sigBlocks.size(); iBlock++) { - for (int n : sigBlocks.get(iBlock)) { - boolean isFirst = true; - for (int termId : eqIndex.get(n).terms()) { - String line; - if (isFirst) { - line = Integer.toString(n); - isFirst = false; - } else { - line = ""; - } - line += "\t" + termIndex.get(termId).name(); - System.out.println(line); - } - } - System.out.println(); - } - } + } } - private static final String ARG_COLUMN = "column"; private static final String ARG_FULLSIG = "fullSigConstraint"; private static final String ARG_LASTDROP = "ignoreLastDrop"; private static final String[] ARGS = { - 
ARG_COLUMN, ARG_FULLSIG, ARG_LASTDROP }; @@ -193,7 +117,6 @@ public void print( "Usage\n" + " --" + ARG_FULLSIG + "=[true | false] [default: false]\n" + " --" + ARG_LASTDROP + "=[true | false] [default: true]\n" + - " --" + ARG_COLUMN + "= (default: none)\n" + " \n" + " \n" + " "; @@ -217,24 +140,15 @@ public static void main(String[] args) { boolean fullSignatureConstraint = params.getAsBool(ARG_FULLSIG, false); boolean ignoreLastDrop = params.getAsBool(ARG_LASTDROP, true); - int columnId = -1; - if (params.has(ARG_COLUMN)) { - columnId = params.getAsInt(ARG_COLUMN); - } try { // Read the node index EQIndex nodeIndex = new EQIndex(eqFile); - Column column = null; - if (columnId > -1) { - column = nodeIndex.columns().get(columnId); - } new ContextSignaturePrinter().print( nodeIndex, new TermIndexReader(termFile), fullSignatureConstraint, ignoreLastDrop, - column, nodeId ); } catch (java.io.IOException ex) { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java index 64b185f..4c65b78 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java @@ -133,7 +133,7 @@ private void compute( CandidateSetFinder candidateFinder, int threads, boolean verbose, - SignatureBlocksConsumerFactory consumerFactory + SignatureBlocksConsumer consumer ) throws java.lang.InterruptedException, java.io.IOException { if (verbose) { @@ -152,6 +152,8 @@ private void compute( System.out.println("START @ " + start); } + consumer.open(); + ExecutorService es = Executors.newCachedThreadPool(); for (int iThread = 0; iThread < threads; iThread++) { es.execute( @@ -159,7 +161,7 @@ private void compute( queue, sigFact, candidateFinder, - consumerFactory.getConsumer() + consumer ) ); } @@ -170,7 +172,7 @@ private void compute( throw new RuntimeException(ex); } - 
consumerFactory.close(); + consumer.close(); Date end = new Date(); if (verbose) { @@ -190,7 +192,7 @@ private void compute( * @param threshold * @param threads * @param verbose - * @param consumerFactory + * @param consumer * @throws java.lang.InterruptedException * @throws java.io.IOException */ @@ -200,7 +202,7 @@ public void runWithThreshold( Threshold threshold, int threads, boolean verbose, - SignatureBlocksConsumerFactory consumerFactory + SignatureBlocksConsumer consumer ) throws java.lang.InterruptedException, java.io.IOException { ThresholdFinder candidateFinder; @@ -212,7 +214,7 @@ public void runWithThreshold( candidateFinder, threads, verbose, - consumerFactory + consumer ); } @@ -225,7 +227,7 @@ public void runWithThreshold( * @param ignoreLastDrop * @param threads * @param verbose - * @param consumerFactory + * @param consumer * @throws java.lang.InterruptedException * @throws java.io.IOException */ @@ -236,7 +238,7 @@ public void runWithMaxDrop( boolean ignoreLastDrop, int threads, boolean verbose, - SignatureBlocksConsumerFactory consumerFactory + SignatureBlocksConsumer consumer ) throws java.lang.InterruptedException, java.io.IOException { MaxDropFinder candidateFinder; @@ -252,7 +254,43 @@ public void runWithMaxDrop( candidateFinder, threads, verbose, - consumerFactory + consumer + ); + } + + /** + * Generate signature blocks using consecutive steepest drops. 
+ * + * @param eqIndex + * @param queue + * @param threads + * @param verbose + * @param consumer + * @throws java.lang.InterruptedException + * @throws java.io.IOException + */ + public void runWithMaxDrop( + EQIndex eqIndex, + ConcurrentLinkedQueue queue, + int threads, + boolean verbose, + SignatureBlocksConsumer consumer + ) throws java.lang.InterruptedException, java.io.IOException { + + MaxDropFinder candidateFinder; + candidateFinder = new MaxDropFinder<>( + new GreaterThanConstraint(BigDecimal.ZERO), + false, + true + ); + + this.compute( + new ContextSignatureGenerator(eqIndex.nodes()), + queue, + candidateFinder, + threads, + verbose, + consumer ); } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java deleted file mode 100644 index a66b63c..0000000 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndexFactory.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.signature; - -import org.opendata.curation.d4.signature.trim.LiberalTrimmer; - -/** - * Signature blocks factory for single in-memory index. 
- * - * @author Heiko Mueller - */ -public class SignatureBlocksIndexFactory implements SignatureBlocksConsumerFactory { - - private SignatureBlocksConsumer _consumer = null; - private final int[] _nodeSizes; - private SignatureBlocksIndex _signatures = null; - - public SignatureBlocksIndexFactory(int[] nodeSizes) { - - _nodeSizes = nodeSizes; - } - - @Override - public void close() { - - } - - @Override - public SignatureBlocksConsumer getConsumer() { - - if (_consumer == null) { - _signatures = new SignatureBlocksIndex(); - _consumer = new LiberalTrimmer(_nodeSizes, _signatures); - } - return _consumer; - } - - @Override - public SignatureBlocksIndex signatures() { - - return _signatures; - } -} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java index 5cc9849..cbd559e 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java @@ -71,11 +71,16 @@ public int max() { return _max; } + + public long sum() { + + return _sum; + } } private StatsCollector _blockStats; private SimilarityHistogram _histogram = null; - private StatsCollector _nodeStats; + private StatsCollector[] _nodeStats; @Override public void close() { @@ -88,13 +93,18 @@ public void consume(SignatureBlocks sig) { _blockStats.add(sig.size()); _histogram.add(sig.maxSim()); - int nodeCount = 0; + int[] nodeCount = new int[11]; for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - nodeCount += sig.get(iBlock).length; + int bl = sig.get(iBlock).length; + for (int i = 0; i < 10; i++) { + nodeCount[i] += Math.min(bl, (i + 1) * 10); + } + nodeCount[10] += bl; } - _nodeStats.add(nodeCount); - + for (int i = 0; i < nodeCount.length; i++) { + _nodeStats[i].add(nodeCount[i]); + } } @Override @@ -102,7 +112,10 @@ public void open() { _blockStats = new StatsCollector(); _histogram = new 
SimilarityHistogram(); - _nodeStats = new StatsCollector(); + _nodeStats = new StatsCollector[11]; + for (int i = 0; i < _nodeStats.length; i++) { + _nodeStats[i] = new StatsCollector(); + } } public void print(PrintWriter out) { @@ -114,25 +127,24 @@ public void print(PrintWriter out) { out.println("MAX. SIZE : " + _blockStats.max()); out.println("AVG. SIZE : " + _blockStats.avg()); out.println(); - out.println("NODE COUNTS"); - out.println("MIN. SIZE : " + _nodeStats.min()); - out.println("MAX. SIZE : " + _nodeStats.max()); - out.println("AVG. SIZE : " + _nodeStats.avg()); + for (int i = 0; i < _nodeStats.length - 1; i++) { + out.println("NODE COUNTS " + ((i + 1) * 10)); + out.println("MIN. SIZE : " + _nodeStats[i].min()); + out.println("MAX. SIZE : " + _nodeStats[i].max()); + out.println("AVG. SIZE : " + _nodeStats[i].avg()); + out.println("SUM : " + _nodeStats[i].sum()); + } + out.println("NODE COUNTS (TOTAL)"); + out.println("MIN. SIZE : " + _nodeStats[10].min()); + out.println("MAX. SIZE : " + _nodeStats[10].max()); + out.println("AVG. SIZE : " + _nodeStats[10].avg()); + out.println("SUM : " + _nodeStats[10].sum()); + out.flush(); } public void print() { - System.out.println("SIGNATURE COUNT: " + _blockStats.count()); - System.out.println(); - System.out.println("SIGNATURE BLOCKS"); - System.out.println("MIN. SIZE : " + _blockStats.min()); - System.out.println("MAX. SIZE : " + _blockStats.max()); - System.out.println("AVG. SIZE : " + _blockStats.avg()); - System.out.println(); - System.out.println("NODE COUNTS"); - System.out.println("MIN. SIZE : " + _nodeStats.min()); - System.out.println("MAX. SIZE : " + _nodeStats.max()); - System.out.println("AVG. 
SIZE : " + _nodeStats.avg()); + this.print(new PrintWriter(System.out)); } private static final String COMMAND = diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java deleted file mode 100644 index dddaf1a..0000000 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriterFactory.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.signature; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; -import org.opendata.core.io.FileSystem; -import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; - -/** - * Signature blocks writer factory. Returns consumer that are open. Since this - * factory returns the same consumer multiple times if writing to a single file - * the calling method should not close the consumer but call the close method - * of this class instead at the end of processing. 
- * - * @author Heiko Mueller - */ -public class SignatureBlocksWriterFactory implements SignatureBlocksConsumerFactory { - - private int _count = 0; - private final File _file; - private SignatureBlocksConsumer _globalConsumer = null; - private final SignatureTrimmerFactory _trimmerFactory; - private final boolean _outputToDir; - private List _openConsumer = null; - - public SignatureBlocksWriterFactory( - File file, - SignatureTrimmerFactory trimmerFactory, - boolean outputToDir - ) { - - _file = file; - _trimmerFactory = trimmerFactory; - _outputToDir = outputToDir; - - _openConsumer = new ArrayList<>(); - - if (outputToDir) { - FileSystem.createFolder(file); - } else { - FileSystem.createParentFolder(file); - } - } - - @Override - public void close() { - - for (SignatureBlocksConsumer consumer : _openConsumer) { - consumer.close(); - } - _globalConsumer = null; - _openConsumer = null; - } - - @Override - public SignatureBlocksConsumer getConsumer() { - - if (_outputToDir) { - String filename = "signature-blocks." 
+ (_count++) + ".txt.gz"; - File outputFile = FileSystem.joinPath(_file, filename); - SignatureBlocksConsumer trimmer = _trimmerFactory.getTrimmer( - new SignatureBlocksWriter(outputFile) - ); - trimmer.open(); - _openConsumer.add(trimmer); - return trimmer; - } else { - if (_globalConsumer == null) { - _globalConsumer = _trimmerFactory.getTrimmer( - new SignatureBlocksWriter(_file) - ); - _globalConsumer.open(); - _openConsumer.add(_globalConsumer); - } - return _globalConsumer; - } - } - - @Override - public SignatureBlocksIndex signatures() throws java.io.IOException { - - SignatureBlocksIndex buffer = new SignatureBlocksIndex(); - new SignatureBlocksReader(_file).stream(_globalConsumer); - return buffer; - } -} diff --git a/src/main/java/org/opendata/db/eq/DefaultEQFactory.java b/src/main/java/org/opendata/db/eq/DefaultEQFactory.java new file mode 100644 index 0000000..b339de3 --- /dev/null +++ b/src/main/java/org/opendata/db/eq/DefaultEQFactory.java @@ -0,0 +1,33 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +/** + * Default factory implementation for creating equivalence class objects from + * string representations. 
+ * + * @author Heiko Mueller + */ +public class DefaultEQFactory implements EQFactory { + + @Override + public EQ parse(String text) { + + return new EQImpl(text.split("\t")); + } +} diff --git a/src/main/java/org/opendata/db/eq/EQFactory.java b/src/main/java/org/opendata/db/eq/EQFactory.java new file mode 100644 index 0000000..d397417 --- /dev/null +++ b/src/main/java/org/opendata/db/eq/EQFactory.java @@ -0,0 +1,29 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +/** + * Factory pattern for creating equivalence class objects from string + * representations. 
+ * + * @author Heiko Mueller + */ +public interface EQFactory { + + public EQ parse(String text); +} diff --git a/src/main/java/org/opendata/db/eq/EQImpl.java b/src/main/java/org/opendata/db/eq/EQImpl.java index 0fbc306..9d08b21 100644 --- a/src/main/java/org/opendata/db/eq/EQImpl.java +++ b/src/main/java/org/opendata/db/eq/EQImpl.java @@ -57,7 +57,7 @@ public IDSet columns() { return _columns; } - private static IDSet parseColumnList(String list) { + public static IDSet parseColumnList(String list) { HashIDSet columns = new HashIDSet(); diff --git a/src/main/java/org/opendata/db/eq/EQIndex.java b/src/main/java/org/opendata/db/eq/EQIndex.java index 11c480c..9b990af 100644 --- a/src/main/java/org/opendata/db/eq/EQIndex.java +++ b/src/main/java/org/opendata/db/eq/EQIndex.java @@ -37,6 +37,11 @@ public class EQIndex extends HashObjectSet implements EQStream, SizeFunction private int[] _nodeSizes = null; + public EQIndex(File eqFile, EQFactory factory) throws java.io.IOException { + + super(new EQReader(eqFile, factory).read()); + } + public EQIndex(File eqFile) throws java.io.IOException { super(new EQReader(eqFile).read()); diff --git a/src/main/java/org/opendata/db/eq/EQReader.java b/src/main/java/org/opendata/db/eq/EQReader.java index fca5f65..177bd14 100644 --- a/src/main/java/org/opendata/db/eq/EQReader.java +++ b/src/main/java/org/opendata/db/eq/EQReader.java @@ -30,11 +30,18 @@ */ public class EQReader implements EQStream { + private EQFactory _factory; private final File _file; - public EQReader(File file) { + public EQReader(File file, EQFactory factory) { _file = file; + _factory = factory; + } + + public EQReader(File file) { + + this(file, new DefaultEQFactory()); } public IdentifiableObjectSet read() throws java.io.IOException { @@ -46,7 +53,7 @@ public IdentifiableObjectSet read() throws java.io.IOException { while ((line = in.readLine()) != null) { line = line.trim(); if (!line.equals("")) { - result.add(new EQImpl(line.split("\t"))); + 
result.add(_factory.parse(line)); } } } diff --git a/src/main/java/org/opendata/db/eq/LazyParseEQ.java b/src/main/java/org/opendata/db/eq/LazyParseEQ.java new file mode 100644 index 0000000..f7dd868 --- /dev/null +++ b/src/main/java/org/opendata/db/eq/LazyParseEQ.java @@ -0,0 +1,69 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +import java.io.PrintWriter; +import org.opendata.core.object.IdentifiableObjectImpl; +import org.opendata.core.set.IDSet; +import org.opendata.core.set.ImmutableIDSet; + +/** + * Alternative implementation for equivalence classes. Delays parsing the list + * of columns and terms until they are first accessed. 
+ * + * @author Heiko Mueller + */ +public class LazyParseEQ extends IdentifiableObjectImpl implements EQ { + + private final String _columns; + private IDSet _columnsList = null; + private final String _terms; + private IDSet _termsList = null; + + public LazyParseEQ(int id, String terms, String columns) { + + super(id); + + _terms = terms; + _columns = columns; + } + + @Override + public IDSet columns() { + + if (_columnsList == null) { + _columnsList = EQImpl.parseColumnList(_columns); + } + return _columnsList; + } + + @Override + public IDSet terms() { + + if (_termsList == null) { + _termsList = new ImmutableIDSet(_terms); + } + return _termsList; + } + + @Override + public void write(PrintWriter out) { + + out.println(String.format("%d\t%s\t%s", this.id(), _terms, _columns)); + } +} diff --git a/src/main/java/org/opendata/db/eq/LazyParseEQFactory.java b/src/main/java/org/opendata/db/eq/LazyParseEQFactory.java new file mode 100644 index 0000000..b275ca6 --- /dev/null +++ b/src/main/java/org/opendata/db/eq/LazyParseEQFactory.java @@ -0,0 +1,34 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +/** + * Factory for equivalence classes that parse column and term lists in a lazy + * fashion. 
+ * + * @author Heiko Mueller + */ +public class LazyParseEQFactory implements EQFactory { + + @Override + public EQ parse(String text) { + + String[] tokens = text.split("\t"); + return new LazyParseEQ(Integer.parseInt(tokens[0]), tokens[1], tokens[2]); + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java b/src/main/java/org/opendata/db/eq/LazyParseEQIndex.java similarity index 72% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java rename to src/main/java/org/opendata/db/eq/LazyParseEQIndex.java index 3f84dce..11495e7 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumerFactory.java +++ b/src/main/java/org/opendata/db/eq/LazyParseEQIndex.java @@ -15,17 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.opendata.curation.d4.signature; +package org.opendata.db.eq; + +import java.io.File; /** - * Factory pattern for signature blocks consumers. + * Equivalence class index for lazy parse classes. * * @author Heiko Mueller */ -public interface SignatureBlocksConsumerFactory { +public class LazyParseEQIndex extends EQIndex { - public void close(); - public SignatureBlocksConsumer getConsumer(); - public SignatureBlocksIndex signatures() throws java.io.IOException; - + public LazyParseEQIndex(File eqFile) throws java.io.IOException { + + super(eqFile, new LazyParseEQFactory()); + } } diff --git a/src/main/java/org/opendata/db/eq/SimilarTermIndexGenerator.java b/src/main/java/org/opendata/db/eq/SimilarTermIndexGenerator.java new file mode 100644 index 0000000..ce135bb --- /dev/null +++ b/src/main/java/org/opendata/db/eq/SimilarTermIndexGenerator.java @@ -0,0 +1,153 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +import java.io.File; +import java.io.PrintWriter; +import java.math.BigDecimal; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.constraint.Threshold; +import org.opendata.core.graph.UndirectedConnectedComponents; +import org.opendata.core.io.FileSystem; +import org.opendata.core.metric.JaccardIndex; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IdentifiableIDSet; +import org.opendata.core.sort.IdentifiableObjectSort; + +/** + * Generate a compressed term index where similar equivalence classes are + * merged. 
+ * + * @author Heiko Mueller + */ +public class SimilarTermIndexGenerator { + + private class OverlapComputer implements Runnable { + + private final UndirectedConnectedComponents _graph; + private final ConcurrentLinkedQueue _queue; + private final List _nodes; + private final Threshold _threshold; + + public OverlapComputer( + ConcurrentLinkedQueue queue, + List nodes, + Threshold threshold, + UndirectedConnectedComponents graph + ) { + _queue = queue; + _nodes = nodes; + _threshold = threshold; + _graph = graph; + } + + @Override + public void run() { + + Node nodeI; + while ((nodeI = _queue.poll()) != null) { + for (int jNode = 0; jNode < _nodes.size(); jNode++) { + Node nodeJ = _nodes.get(jNode); + if (nodeJ.id() < nodeI.id()) { + int overlap = nodeI.overlap(nodeJ); + if (overlap > 0) { + BigDecimal sim = new JaccardIndex() + .sim(nodeI.columnCount(), nodeJ.columnCount(), overlap); + if (_threshold.isSatisfied(sim)) { + _graph.edge(nodeI.id(), nodeJ.id()); + } + } + } else { + break; + } + } + } + } + } + + public void run(EQIndex eqIndex, Threshold threshold, int threads, PrintWriter out) { + + UndirectedConnectedComponents graph; + graph = new UndirectedConnectedComponents(eqIndex.nodes().keys()); + + List nodes = eqIndex.nodes().toList(); + ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(nodes); + + Collections.sort(nodes, new IdentifiableObjectSort<>()); + + ExecutorService es = Executors.newCachedThreadPool(); + for (int iThread = 0; iThread < threads; iThread++) { + es.execute(new OverlapComputer(queue, nodes, threshold, graph)); + } + es.shutdown(); + try { + es.awaitTermination(threads, TimeUnit.DAYS); + } catch (java.lang.InterruptedException ex) { + throw new RuntimeException(ex); + } + + for (IdentifiableIDSet comp : graph.getComponents()) { + if (comp.length() == 1) { + eqIndex.get(comp.first()).write(out); + } else { + HashIDSet columns = new HashIDSet(); + HashIDSet terms = new HashIDSet(); + for (int nodeId : comp) { + EQ node = 
eqIndex.get(nodeId); + columns.add(node.columns()); + terms.add(node.terms()); + } + new EQImpl(comp.id(), terms, columns).write(out); + } + } + } + + private final static String COMMAND = + "Usage:\n \n \n \n "; + + private final static Logger LOGGER = Logger + .getLogger(SimilarTermIndexGenerator.class.getName()); + + public static void main(String[] args) { + + if (args.length != 4) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + Threshold threshold = Threshold.getConstraint(args[1]); + int threads = Integer.parseInt(args[2]); + File outFile = new File(args[3]); + + try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { + new SimilarTermIndexGenerator() + .run(new EQIndex(eqFile), threshold, threads, out); + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} From 99a541dd0ec0c302bcbe60d0d969e4d01a9ede6e Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 28 Nov 2020 11:21:07 -0500 Subject: [PATCH 13/25] Add trimmer option for original nodes only --- .../graph/UndirectedConnectedComponents.java | 48 ++--- ...Set.java => IdentifiableIDSetWrapper.java} | 6 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 9 +- .../d4/column/ExpandedColumnIndex.java | 16 +- .../d4/column/ParallelColumnExpander.java | 13 +- .../opendata/curation/d4/domain/Domain.java | 4 +- .../domain/MultiScanLocalDomainGenerator.java | 7 +- .../SingleScanLocalDomainGenerator.java | 10 +- .../d4/domain/UndirectedDomainGenerator.java | 3 + .../SignatureComputationExperiment.java | 96 ---------- .../d4/prov/CentristBlockUsageWriter.java | 181 ------------------ .../d4/signature/trim/BlockScoreFunction.java | 6 +- .../trim/CentristBlockRelevanceFilter.java | 4 +- .../d4/signature/trim/MinJIScore.java | 4 +- .../d4/signature/trim/PrecisionScore.java | 9 +- .../trim/SignatureTrimmerFactory.java | 5 +- src/main/java/org/opendata/db/eq/EQIndex.java | 
9 +- 18 files changed, 86 insertions(+), 346 deletions(-) rename src/main/java/org/opendata/core/set/{SimpleIdentifiableIDSet.java => IdentifiableIDSetWrapper.java} (95%) delete mode 100644 src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java delete mode 100644 src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java diff --git a/src/main/java/org/opendata/core/graph/UndirectedConnectedComponents.java b/src/main/java/org/opendata/core/graph/UndirectedConnectedComponents.java index f283bf3..ddb12c6 100644 --- a/src/main/java/org/opendata/core/graph/UndirectedConnectedComponents.java +++ b/src/main/java/org/opendata/core/graph/UndirectedConnectedComponents.java @@ -76,33 +76,10 @@ public synchronized boolean contains(int nodeId) { } } - /** - * Get the connected component that contains the node with the given - * identifier. - * - * @param nodeId - * @return - */ - private int getComponentForNode(int nodeId) { - - //if (!_nodes.contains(nodeId)) { - // throw new RuntimeException("Unknown node identifier: " + nodeId); - //} - - // If the nodeId is not contained in the component map then the node - // is in the component that has the same identifier as the nodeId - //if (_componentMap.containsKey(nodeId)) { - // return _componentMap.get(nodeId); - //} else { - // return nodeId; - //} - return _componentMap[nodeId]; - } - public synchronized void edge(int sourceId, int targetId) { - int sourceCompId = this.getComponentForNode(sourceId); - int targetCompId = this.getComponentForNode(targetId); + int sourceCompId = _componentMap[sourceId]; + int targetCompId = _componentMap[targetId]; if (sourceCompId != targetCompId) { // The respective components may not have been instantiated yet. @@ -165,6 +142,20 @@ public synchronized IdentifiableObjectSet getComponents() { return result; } + + /** + * Test if all nodes belong to the same single component. 
+ * + * @return + */ + public boolean isComplete() { + + if (_components.size() == 1) { + int compSize = _components.values().iterator().next().size(); + return (compSize == _nodes.length()); + } + return false; + } private void merge( HashSet target, @@ -180,11 +171,4 @@ private void merge( _components.remove(sourceCompId); } - - public synchronized boolean nodesAreInSameComponent(int node1, int node2) { - - int comp1 = this.getComponentForNode(node1); - int comp2 = this.getComponentForNode(node2); - return (comp1 == comp2); - } } diff --git a/src/main/java/org/opendata/core/set/SimpleIdentifiableIDSet.java b/src/main/java/org/opendata/core/set/IdentifiableIDSetWrapper.java similarity index 95% rename from src/main/java/org/opendata/core/set/SimpleIdentifiableIDSet.java rename to src/main/java/org/opendata/core/set/IdentifiableIDSetWrapper.java index d128b98..99909b3 100644 --- a/src/main/java/org/opendata/core/set/SimpleIdentifiableIDSet.java +++ b/src/main/java/org/opendata/core/set/IdentifiableIDSetWrapper.java @@ -30,18 +30,18 @@ * * @author Heiko Mueller */ -public class SimpleIdentifiableIDSet extends IdentifiableObjectImpl implements IdentifiableIDSet { +public class IdentifiableIDSetWrapper extends IdentifiableObjectImpl implements IdentifiableIDSet { private final IDSet _values; - public SimpleIdentifiableIDSet(int id) { + public IdentifiableIDSetWrapper(int id) { super(id); _values = new ImmutableIDSet(); } - public SimpleIdentifiableIDSet(int id, IDSet values) { + public IdentifiableIDSetWrapper(int id, IDSet values) { super(id); diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index ecf8907..e00ecc4 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = 
"0.29.0.dev7"; + public static final String VERSION = "0.29.0.dev8"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index f00e2d4..4905e24 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -54,6 +54,7 @@ import org.opendata.curation.d4.signature.SignatureBlocksWriter; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; +import org.opendata.db.Database; import org.opendata.db.column.Column; import org.opendata.db.eq.CompressedTermIndexGenerator; import org.opendata.db.eq.EQIndex; @@ -81,7 +82,7 @@ public void expandColumns( File outputFile ) throws java.io.IOException { - IdentifiableObjectSet db = nodeIndex.columns(); + IdentifiableObjectSet db = new Database(nodeIndex).columns(); new ParallelColumnExpander(telemetry).run( nodeIndex, signatures, @@ -127,6 +128,7 @@ public void localDomains( File columnsFile, SignatureBlocksReader signatures, String trimmer, + boolean originalOnly, int threads, boolean singleScan, boolean verbose, @@ -142,6 +144,7 @@ public void localDomains( columnIndex, signatures, trimmer, + originalOnly, threads, verbose, new DomainWriter(outputFile) @@ -152,6 +155,7 @@ public void localDomains( columnIndex, signatures.read(), trimmer, + originalOnly, threads, verbose, new DomainWriter(outputFile) @@ -479,6 +483,7 @@ public static void main(String[] args) { "trimmer", " [default: " + SignatureTrimmer.CENTRIST + "]" ), + new Parameter("originalonly", " [default: false]"), new Parameter("threads", " [default: 6]"), new Parameter("singlescan", " [default: false]"), new Parameter("verbose", " [default: true]"), @@ -493,6 +498,7 @@ public static void main(String[] args) { File columnsFile = params.getAsFile("columns", "expanded-columns.txt.gz"); File signatureFile = params.getAsFile("signatures", 
"signatures.txt.gz"); String trimmer = params.getAsString("trimmer", SignatureTrimmer.CENTRIST); + boolean originalOnly = params.getAsBool("originalonly", false); int threads = params.getAsInt("threads", 6); boolean singleScan = params.getAsBool("singlescan", false); boolean verbose = params.getAsBool("verbose", true); @@ -503,6 +509,7 @@ public static void main(String[] args) { columnsFile, new SignatureBlocksReader(signatureFile), trimmer, + originalOnly, threads, singleScan, verbose, diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java index 232952c..a6b8da3 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java @@ -23,8 +23,9 @@ import org.opendata.core.set.HashIDSet; import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableIDSet; +import org.opendata.core.set.IdentifiableIDSetWrapper; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; /** * Create an unique index of expanded columns. 
Two expanded columns are @@ -86,14 +87,19 @@ public void open() { _columnMapping = new HashMap<>(); } - public IdentifiableObjectSet toColumns() { + public IdentifiableObjectSet toColumns(boolean originalOnly) { - HashObjectSet result = new HashObjectSet<>(); + HashObjectSet result = new HashObjectSet<>(); for (ExpandedColumn column : _columnList) { - IDSet nodes = column.nodes(); + IDSet nodes; + if (originalOnly) { + nodes = column.originalNodes(); + } else { + nodes = column.nodes(); + } for (int columnId : this.columns(column.id())) { - result.add(new Column(columnId, nodes)); + result.add(new IdentifiableIDSetWrapper(columnId, nodes)); } } diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index 9ff9f81..593bd87 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -35,7 +35,9 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.core.util.MemUsagePrinter; @@ -255,18 +257,21 @@ public void run( new MemUsagePrinter().print(); } + SignatureTrimmerFactory trimmerFactory; + trimmerFactory = new SignatureTrimmerFactory(nodes, nodes.columns(), trimmer); + ExecutorService es = Executors.newCachedThreadPool(); for (int iThread = 0; iThread < threads; iThread++) { - List columns = new ArrayList<>(); + List taskColumns = new ArrayList<>(); for (int iCol = iThread; iCol < columnList.size(); iCol += threads) { - columns.add(columnList.get(iCol)); + taskColumns.add(columnList.get(iCol)); } 
ExpanderTask expander = new ExpanderTask( iThread, nodes, - columns, + taskColumns, signatures, - new SignatureTrimmerFactory(nodes, nodes.columns(), trimmer), + trimmerFactory, threshold, decreaseFactor, numberOfIterations, diff --git a/src/main/java/org/opendata/curation/d4/domain/Domain.java b/src/main/java/org/opendata/curation/d4/domain/Domain.java index 3faa9d5..5cb1f9a 100644 --- a/src/main/java/org/opendata/curation/d4/domain/Domain.java +++ b/src/main/java/org/opendata/curation/d4/domain/Domain.java @@ -19,7 +19,7 @@ import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableIDSet; -import org.opendata.core.set.SimpleIdentifiableIDSet; +import org.opendata.core.set.IdentifiableIDSetWrapper; /** * A domain is an identifiable object that has a list of column identifier and @@ -29,7 +29,7 @@ * * @author Heiko Mueller */ -public class Domain extends SimpleIdentifiableIDSet implements IdentifiableIDSet { +public class Domain extends IdentifiableIDSetWrapper implements IdentifiableIDSet { private final IDSet _columns; private int[] _nodes; diff --git a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java index 99a63a1..edbea99 100644 --- a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java @@ -127,6 +127,7 @@ public void run( ExpandedColumnIndex columnIndex, SignatureBlocksStream signatures, String trimmer, + boolean originalOnly, int threads, boolean verbose, DomainConsumer consumer @@ -146,7 +147,11 @@ public void run( queue = new ConcurrentLinkedQueue<>(columnIndex.columns()); SignatureTrimmerFactory trimmerFactory; - trimmerFactory = new SignatureTrimmerFactory(nodes, columnIndex.toColumns(), trimmer); + trimmerFactory = new SignatureTrimmerFactory( + nodes, + columnIndex.toColumns(originalOnly), + trimmer + ); for 
(int iThread = 0; iThread < threads; iThread++) { DomainGeneratorTask task = new DomainGeneratorTask( diff --git a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java index 3711b87..46c4a4a 100644 --- a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java @@ -135,6 +135,7 @@ public void run( ExpandedColumnIndex columnIndex, SignatureBlocksStream signatures, String trimmer, + boolean originalOnly, int threads, boolean verbose, DomainConsumer consumer @@ -169,6 +170,13 @@ public void run( ); } + SignatureTrimmerFactory trimmerFactory = new SignatureTrimmerFactory( + nodes, + columnIndex.toColumns(originalOnly), + trimmer + ); + + for (int iThread = 0; iThread < threads; iThread++) { List columns = new ArrayList<>(); for (int iCol = iThread; iCol < columnList.size(); iCol += threads) { @@ -179,7 +187,7 @@ public void run( nodes, columns, signatures, - new SignatureTrimmerFactory(nodes, columnIndex.toColumns(), trimmer), + trimmerFactory, domains, verbose ); diff --git a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java index 0cc9ac8..b23dcd0 100644 --- a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java @@ -77,6 +77,9 @@ public void consume(SignatureBlocks sig) { } } } + if (this.isComplete()) { + + } } } diff --git a/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java b/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java deleted file mode 100644 index be6d329..0000000 --- a/src/main/java/org/opendata/curation/d4/experiments/SignatureComputationExperiment.java 
+++ /dev/null @@ -1,96 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.experiments; - -import java.io.File; -import java.io.PrintWriter; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.opendata.core.io.FileSystem; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksGenerator; -import org.opendata.curation.d4.signature.SignatureBlocksIndex; -import org.opendata.curation.d4.signature.trim.LiberalTrimmer; -import org.opendata.curation.d4.telemetry.TelemetrySet; -import org.opendata.db.column.Column; -import org.opendata.db.eq.EQIndex; - -/** - * Print execution time and signature sizes for context signature blocks - * generation for all columns in a database. 
- * - * @author Heiko Mueller - */ -public class SignatureComputationExperiment { - - private final static String COMMAND = - "Usage: "; - - private final static Logger LOGGER = Logger - .getLogger(SignatureComputationExperiment.class.getName()); - - public static void main(String[] args) { - - if (args.length != 3) { - System.out.println(COMMAND); - System.exit(-1); - } - - File eqFile = new File(args[0]); - int threads = Integer.parseInt(args[1]); - File outFile = new File(args[2]); - - try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { - EQIndex eqIndex = new EQIndex(eqFile); - for (Column column : eqIndex.columns()) { - TelemetrySet telemetry = new TelemetrySet(); - SignatureBlocksIndex consumer = new SignatureBlocksIndex(); - new SignatureBlocksGenerator(telemetry) - .runWithMaxDrop( - eqIndex, - new ConcurrentLinkedQueue<>(column.toList()), - threads, - true, - new LiberalTrimmer(eqIndex.nodeSizes(), consumer) - ); - long execTime = telemetry.get(SignatureBlocksGenerator.TELEMETRY_ID); - int blockCount = 0; - int nodeCount = 0; - for (SignatureBlocks sig : consumer) { - blockCount += sig.size(); - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - nodeCount += sig.get(iBlock).length; - } - } - String line = String.format( - "%d\t%d\t%d\t%d\t%d", - column.id(), - execTime, - consumer.length(), - blockCount, - nodeCount - ); - out.println(line); - System.out.println(line); - } - } catch (java.lang.InterruptedException | java.io.IOException ex) { - LOGGER.log(Level.SEVERE, "RUN", ex); - } - } -} diff --git a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java b/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java deleted file mode 100644 index 42fdfb1..0000000 --- a/src/main/java/org/opendata/curation/d4/prov/CentristBlockUsageWriter.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.prov; - -import java.io.File; -import java.io.PrintWriter; -import java.util.HashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.opendata.core.io.FileSystem; -import org.opendata.core.set.HashIDSet; -import org.opendata.curation.d4.Constants; -import org.opendata.curation.d4.signature.ConcurrentSignatureBlocksStream; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.trim.CentristTrimmer; -import org.opendata.curation.d4.signature.trim.PrecisionScore; -import org.opendata.db.column.Column; -import org.opendata.db.eq.EQIndex; - -/** - * For each signature count the blocks (and their size) for those blocks that - * are part of at least one centrist column signature. 
- * - * @author heiko - */ -public class CentristBlockUsageWriter { - - private class OverlapComputer implements Runnable { - - private final EQIndex _eqIndex; - private final File _outputFile; - private final ConcurrentSignatureBlocksStream _signatures; - private final HashMap _trimmer; - public OverlapComputer( - EQIndex eqIndex, - HashMap trimmer, - ConcurrentSignatureBlocksStream signatures, - File outputFile - ) { - _eqIndex = eqIndex; - _trimmer = trimmer; - _signatures = signatures; - _outputFile = outputFile; - } - - @Override - public void run() { - - try (PrintWriter out = FileSystem.openPrintWriter(_outputFile)) { - SignatureBlocks sig; - while ((sig = _signatures.next()) != null) { - HashIDSet blocks = new HashIDSet(); - for (int columnId : _eqIndex.get(sig.id()).columns()) { - CentristTrimmer trimmer = _trimmer.get(columnId); - blocks.add(trimmer.trimmedBlocks(sig)); - } - int sigSize = 0; - int usedBlocksSize = 0; - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - int blockLen = sig.get(iBlock).length; - sigSize += blockLen; - if (blocks.contains(iBlock)) { - usedBlocksSize += blockLen; - } - } - out.println( - String.format( - "%d\t%d\t%d\t%d\t%d", - sig.id(), - sig.size(), - blocks.length(), - sigSize, - usedBlocksSize - ) - ); - } - } catch (java.io.IOException ex) { - throw new RuntimeException(ex); - } - } - } - - public void run( - EQIndex eqIndex, - ConcurrentSignatureBlocksStream signatures, - int threads, - File outputDir - ) { - - FileSystem.createFolder(outputDir); - - PrecisionScore scoreFunc = new PrecisionScore(eqIndex); - - HashMap trimmer = new HashMap<>(); - for (Column column : eqIndex.columns()) { - trimmer.put(column.id(), new CentristTrimmer(column, scoreFunc)); - } - - ExecutorService es = Executors.newCachedThreadPool(); - for (int iThread = 0; iThread < threads; iThread++) { - String filename = "centrist-blocks-usage." 
+ iThread + ".tsv.gz"; - File outputFile = FileSystem.joinPath(outputDir, filename); - OverlapComputer thread; - thread = new OverlapComputer(eqIndex, trimmer, signatures, outputFile); - es.execute(thread); - } - es.shutdown(); - try { - es.awaitTermination(threads, TimeUnit.DAYS); - } catch (java.lang.InterruptedException ex) { - throw new RuntimeException(ex); - } - } - - private final static String COMMAND = - "Usage:\n" + - " \n" + - " \n" + - " \n" + - " "; - - private final static Logger LOGGER = Logger - .getLogger(CentristBlockUsageWriter.class.getName()); - - public static void main(String[] args) { - - System.out.println("Centrist Block Usage Writer - Version (" + Constants.VERSION + ")\n"); - - if (args.length != 4) { - System.out.println(COMMAND); - System.exit(-1); - } - - File eqFile = new File(args[0]); - File signatureDir = new File(args[1]); - int threads = Integer.parseInt(args[2]); - File outputDir = new File(args[3]); - - EQIndex eqIndex = null; - try { - eqIndex = new EQIndex(eqFile); - } catch (java.io.IOException ex) { - LOGGER.log(Level.SEVERE, "READ EQs", ex); - System.exit(-1); - } - - ConcurrentSignatureBlocksStream signatures = null; - try { - signatures = new ConcurrentSignatureBlocksStream(signatureDir); - } catch (java.io.IOException ex) { - LOGGER.log(Level.SEVERE, "READ SIGNATURES", ex); - System.exit(-1); - } - - try { - new CentristBlockUsageWriter() - .run(eqIndex, signatures, threads, outputDir); - } catch (java.lang.RuntimeException ex) { - LOGGER.log(Level.SEVERE, "RUN", ex); - System.exit(-1); - } - } -} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java b/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java index bb72b2f..1413ce6 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/BlockScoreFunction.java @@ -19,8 +19,8 @@ import java.math.BigDecimal; import 
java.util.HashMap; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -36,7 +36,7 @@ public abstract class BlockScoreFunction { public BlockScoreFunction( EQIndex eqIndex, - IdentifiableObjectSet columns + IdentifiableObjectSet columns ) { _nodeSize = eqIndex.nodeSizes(); @@ -44,7 +44,7 @@ public BlockScoreFunction( _columns = new HashMap<>(); _columnSize = new HashMap<>(); - for (Column column : columns) { + for (IdentifiableIDSet column : columns) { _columns.put(column.id(), column.toArray()); int size = 0; for (int nodeId : column) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java index e9a3366..f9477cd 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java @@ -32,9 +32,9 @@ import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.sort.DoubleValueDescSort; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -95,7 +95,7 @@ public CentristBlockRelevanceFilter( public CentristBlockRelevanceFilter( EQIndex eqIndex, - IdentifiableObjectSet columns, + IdentifiableObjectSet columns, SignatureBlocksConsumer consumer ) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java b/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java index 0edb0b8..9508a43 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java +++ 
b/src/main/java/org/opendata/curation/d4/signature/trim/MinJIScore.java @@ -19,8 +19,8 @@ import java.math.BigDecimal; import java.math.MathContext; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -31,7 +31,7 @@ */ public class MinJIScore extends BlockScoreFunction { - public MinJIScore(EQIndex eqIndex, IdentifiableObjectSet columns) { + public MinJIScore(EQIndex eqIndex, IdentifiableObjectSet columns) { super(eqIndex, columns); } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java b/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java index aef7344..c67f890 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/PrecisionScore.java @@ -19,8 +19,8 @@ import java.math.BigDecimal; import org.opendata.core.metric.Precision; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -29,15 +29,10 @@ */ public class PrecisionScore extends BlockScoreFunction { - public PrecisionScore(EQIndex eqIndex, IdentifiableObjectSet columns) { + public PrecisionScore(EQIndex eqIndex, IdentifiableObjectSet columns) { super(eqIndex, columns); } - - public PrecisionScore(EQIndex eqIndex) { - - super(eqIndex, eqIndex.columns()); - } @Override public BigDecimal relevance(int columnSize, int blockSize, int overlap) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index a147970..953591f 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ 
b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -21,7 +21,6 @@ import org.opendata.curation.d4.signature.SignatureBlocksConsumer; import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; /** @@ -31,14 +30,14 @@ */ public class SignatureTrimmerFactory { - private final IdentifiableObjectSet _columns; + private final IdentifiableObjectSet _columns; private final EQIndex _nodes; private PrecisionScore _scoreFunc = null; private final String _trimmerSpec; public SignatureTrimmerFactory( EQIndex nodes, - IdentifiableObjectSet columns, + IdentifiableObjectSet columns, String trimmerSpec ) { _nodes = nodes; diff --git a/src/main/java/org/opendata/db/eq/EQIndex.java b/src/main/java/org/opendata/db/eq/EQIndex.java index 9b990af..7deebd7 100644 --- a/src/main/java/org/opendata/db/eq/EQIndex.java +++ b/src/main/java/org/opendata/db/eq/EQIndex.java @@ -22,6 +22,7 @@ import org.opendata.core.io.FileSystem; import org.opendata.core.prune.SizeFunction; import org.opendata.core.set.HashObjectSet; +import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.util.count.IdentifiableCount; import org.opendata.core.util.count.IdentifiableCounterSet; @@ -63,9 +64,13 @@ public int[] columnSizes() { return values; } - public IdentifiableObjectSet columns() { + public IdentifiableObjectSet columns() { - return new Database(this).columns(); + HashObjectSet columns = new HashObjectSet<>(); + for (Column column : new Database(this).columns()) { + columns.add(column); + } + return columns; } @Override From 6593d20637e49c0af2e4e0098a6b3e64177759e6 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 28 Nov 2020 11:48:12 -0500 Subject: [PATCH 14/25] Add local domain early termination option --- .../org/opendata/curation/d4/Constants.java | 2 +- 
.../curation/d4/column/ColumnExpander.java | 10 ++------ .../curation/d4/column/ExpandedColumn.java | 5 ++++ .../d4/column/ParallelColumnExpander.java | 5 +--- .../domain/MultiScanLocalDomainGenerator.java | 24 ++++++++++++++----- .../SingleScanLocalDomainGenerator.java | 2 +- .../d4/domain/UndirectedDomainGenerator.java | 9 ++++++- .../d4/signature/RobustSignatureIndex.java | 6 +++++ .../d4/signature/SignatureBlocksBuffer.java | 6 +++++ .../d4/signature/SignatureBlocksConsumer.java | 1 + .../signature/SignatureBlocksDispatcher.java | 11 +++++++++ .../d4/signature/SignatureBlocksIndex.java | 10 ++++++++ .../d4/signature/SignatureBlocksStats.java | 6 +++++ .../d4/signature/SignatureBlocksWriter.java | 6 +++++ .../d4/signature/trim/SignatureTrimmer.java | 6 +++++ .../trim/SignatureTrimmerFactory.java | 8 +++---- 16 files changed, 91 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index e00ecc4..7878847 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev8"; + public static final String VERSION = "0.29.0.dev9"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java index ae9e0bb..b0e806e 100644 --- a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java @@ -89,10 +89,7 @@ public void run() { SignatureTrimmer trimmer; trimmer = _trimmerFactory .getTrimmer( - new MutableIdentifiableIDSet( - column.id(), - column.originalNodes() - ), + column.id(), columnExpander ); dispatcher.add(trimmer); @@ -132,10 +129,7 @@ public void 
run() { SignatureTrimmer trimmer; trimmer = _trimmerFactory .getTrimmer( - new MutableIdentifiableIDSet( - expander.column().id(), - expander.column().nodes() - ), + expander.column().id(), expander ); dispatcher.add(trimmer); diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumn.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumn.java index 4636429..d345445 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumn.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumn.java @@ -68,4 +68,9 @@ public IDSet originalNodes() { return _nodes; } + + public int totalSize() { + + return _nodes.length() + _expansion.length(); + } } diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index 593bd87..f9c7169 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -124,10 +124,7 @@ public void run() { SignatureTrimmer trimmer; trimmer = _trimmerFactory .getTrimmer( - new MutableIdentifiableIDSet( - expander.column().id(), - expander.column().nodes() - ), + expander.column().id(), expander ); dispatcher.add(trimmer); diff --git a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java index edbea99..9f1c2b2 100644 --- a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java @@ -17,7 +17,10 @@ */ package org.opendata.curation.d4.domain; +import java.util.Collections; +import java.util.Comparator; import java.util.Date; +import java.util.List; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; 
@@ -83,6 +86,7 @@ public void run() { ExpandedColumn column; while ((column = _columns.poll()) != null) { + System.out.print(column.id() + " (" + column.totalSize() + "): " + new Date()); MutableIdentifiableIDSet col; col = new MutableIdentifiableIDSet(column.id(), column.nodes()); SignatureBlocksConsumer domainGenerator; @@ -92,7 +96,7 @@ public void run() { _nodes.nodeSizes() ); SignatureTrimmer trimmer; - trimmer = _trimmerFactory.getTrimmer(col, domainGenerator); + trimmer = _trimmerFactory.getTrimmer(col.id(), domainGenerator); try { _signatures.stream(trimmer); } catch (java.io.IOException ex) { @@ -136,15 +140,18 @@ public void run( UniqueDomainSet domains = new UniqueDomainSet(columnIndex); Date start = new Date(); - if (verbose) { - System.out.println("START @ " + start); - new MemUsagePrinter().print(); - } ExecutorService es = Executors.newCachedThreadPool(); + List columnList = columnIndex.columns(); + Collections.sort(columnList, new Comparator(){ + @Override + public int compare(ExpandedColumn col1, ExpandedColumn col2) { + return Integer.compare(col2.totalSize(), col1.totalSize()); + } + }); ConcurrentLinkedQueue queue; - queue = new ConcurrentLinkedQueue<>(columnIndex.columns()); + queue = new ConcurrentLinkedQueue<>(columnList); SignatureTrimmerFactory trimmerFactory; trimmerFactory = new SignatureTrimmerFactory( @@ -152,6 +159,11 @@ public void run( columnIndex.toColumns(originalOnly), trimmer ); + + if (verbose) { + System.out.println("START @ " + start); + new MemUsagePrinter().print(); + } for (int iThread = 0; iThread < threads; iThread++) { DomainGeneratorTask task = new DomainGeneratorTask( diff --git a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java index 46c4a4a..2353d8e 100644 --- a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java +++ 
b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java @@ -96,7 +96,7 @@ public void run() { _nodes.nodeSizes() ); SignatureTrimmer trimmer; - trimmer = _trimmerFactory.getTrimmer(col, domainGenerator); + trimmer = _trimmerFactory.getTrimmer(col.id(), domainGenerator); dispatcher.add(trimmer); } diff --git a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java index b23dcd0..088e65c 100644 --- a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java @@ -33,6 +33,7 @@ public class UndirectedDomainGenerator extends UndirectedConnectedComponents implements SignatureBlocksConsumer { private final ExpandedColumn _column; + private boolean _isDone = false; private final int[] _nodeSizes; private final UniqueDomainSet _resultSet; @@ -78,11 +79,17 @@ public void consume(SignatureBlocks sig) { } } if (this.isComplete()) { - + _isDone = true; } } } + @Override + public boolean isDone() { + + return _isDone; + } + @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java index 78152ee..f73799b 100644 --- a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java @@ -48,6 +48,12 @@ public SignatureBlocks get(int id) { return _elements.get(id); } + + @Override + public boolean isDone() { + + return false; + } @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java index 71890c5..8d25bbb 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java 
+++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java @@ -50,6 +50,12 @@ public SignatureBlocks get(int index) { return _signatures.get(index); } + + @Override + public boolean isDone() { + + return false; + } @Override public Iterator iterator() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java index 61dedbd..8cfbf85 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java @@ -26,5 +26,6 @@ public interface SignatureBlocksConsumer { public void close(); public void consume(SignatureBlocks sig); + public boolean isDone(); public void open(); } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java index b29ed0a..5398e26 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java @@ -61,6 +61,17 @@ public void consume(SignatureBlocks sig) { } } + @Override + public boolean isDone() { + + for (SignatureBlocksConsumer consumer : _consumers) { + if (consumer.isDone()) { + return true; + } + } + return false; + } + @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java index 574b8de..c36bf12 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java @@ -51,6 +51,12 @@ public SignatureBlocks get(int id) { return _signatures.get(id); } + + @Override + public boolean isDone() { + + return false; + } @Override public Iterator 
iterator() { @@ -75,6 +81,10 @@ public void stream(SignatureBlocksConsumer consumer) { for (SignatureBlocks sig : _signatures) { consumer.consume(sig); + if (consumer.isDone()) { + System.out.println("Terminate early"); + break; + } } consumer.close(); diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java index cbd559e..82de5d3 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java @@ -107,6 +107,12 @@ public void consume(SignatureBlocks sig) { } } + @Override + public boolean isDone() { + + return false; + } + @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java index 7e7ec19..87f1560 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java @@ -60,6 +60,12 @@ public void consume(SignatureBlocks sig) { } } + @Override + public boolean isDone() { + + return false; + } + @Override public synchronized void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java index a6993ce..4a2763e 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java @@ -80,6 +80,12 @@ public void consume(SignatureBlocks sig) { } } + @Override + public boolean isDone() { + + return _consumer.isDone(); + } + @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java 
b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index 953591f..7cf6d4c 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -65,15 +65,13 @@ public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { * Get column specific trimmer for a given column. We currently do not make * use of the empty signature constraint. * - * @param column + * @param columnId * @param consumer * @return */ - public SignatureTrimmer getTrimmer( - IdentifiableIDSet column, - SignatureBlocksConsumer consumer - ) { + public SignatureTrimmer getTrimmer(int columnId, SignatureBlocksConsumer consumer) { + IdentifiableIDSet column = _columns.get(columnId); if (_trimmerSpec.equals(SignatureTrimmer.CONSERVATIVE)) { return new ConservativeTrimmer(column, consumer); } else if (_trimmerSpec.equals(SignatureTrimmer.CENTRIST)) { From 977dd7fdf32edc2cd3d290b8d9051289563a2e93 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sat, 28 Nov 2020 12:31:48 -0500 Subject: [PATCH 15/25] Change verbose output for local domain discovery --- .../org/opendata/curation/d4/Constants.java | 2 +- .../domain/MultiScanLocalDomainGenerator.java | 20 ++++++++++++-- .../SingleScanLocalDomainGenerator.java | 26 ++++++++++++++----- .../d4/signature/SignatureBlocksIndex.java | 1 - 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 7878847..d7b0bc2 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev9"; + public static final String VERSION = "0.29.0.dev10"; } \ No newline at end 
of file diff --git a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java index 9f1c2b2..b8e02d0 100644 --- a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java @@ -25,6 +25,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.logging.Level; import org.opendata.curation.d4.telemetry.TelemetryCollector; import org.opendata.curation.d4.telemetry.TelemetryPrinter; import org.opendata.curation.d4.column.ExpandedColumn; @@ -86,7 +87,6 @@ public void run() { ExpandedColumn column; while ((column = _columns.poll()) != null) { - System.out.print(column.id() + " (" + column.totalSize() + "): " + new Date()); MutableIdentifiableIDSet col; col = new MutableIdentifiableIDSet(column.id(), column.nodes()); SignatureBlocksConsumer domainGenerator; @@ -97,11 +97,14 @@ public void run() { ); SignatureTrimmer trimmer; trimmer = _trimmerFactory.getTrimmer(col.id(), domainGenerator); + Date runStart = new Date(); try { _signatures.stream(trimmer); } catch (java.io.IOException ex) { throw new RuntimeException(ex); } + Date runEnd = new Date(); + System.out.println(column.id() + " (" + column.totalSize() + "): " + (runEnd.getTime() - runStart.getTime()) + " ms"); } Date end = new Date(); @@ -161,7 +164,20 @@ public int compare(ExpandedColumn col1, ExpandedColumn col2) { ); if (verbose) { - System.out.println("START @ " + start); + System.out.println( + String.format( + "LOCAL DOMAINS FOR %d COLUMN GROUPS USING:\n" + + " --trimmer=%s\n" + + " --originalonly=%s\n" + + " --threads=%d\n" + + " --singlescan=false", + columnList.size(), + trimmer, + Boolean.toString(originalOnly), + threads + ) + ); + System.out.println(String.format("START @ %s", start)); new 
MemUsagePrinter().print(); } diff --git a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java index 2353d8e..1a2819f 100644 --- a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java @@ -144,12 +144,7 @@ public void run( UniqueDomainSet domains = new UniqueDomainSet(columnIndex); Date start = new Date(); - if (verbose) { - System.out.println("START @ " + start); - } - - ExecutorService es = Executors.newCachedThreadPool(); - + // Sort column in decreasing number of nodes List columnList = new ArrayList<>(columnIndex.columns()); Collections.sort(columnList, (ExpandedColumn c1, ExpandedColumn c2) -> @@ -157,6 +152,25 @@ public void run( ); Collections.reverse(columnList); + if (verbose) { + System.out.println( + String.format( + "LOCAL DOMAINS FOR %d COLUMN GROUPS USING:\n" + + " --trimmer=%s\n" + + " --originalonly=%s\n" + + " --threads=%d\n" + + " --singlescan=false", + columnList.size(), + trimmer, + Boolean.toString(originalOnly), + threads + ) + ); + System.out.println("START @ " + start); + } + + ExecutorService es = Executors.newCachedThreadPool(); + if (verbose) { System.out.println( String.format( diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java index c36bf12..3568a02 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java @@ -82,7 +82,6 @@ public void stream(SignatureBlocksConsumer consumer) { for (SignatureBlocks sig : _signatures) { consumer.consume(sig); if (consumer.isDone()) { - System.out.println("Terminate early"); break; } } From 8a879d15cff71b7d988d852169dffdd43d336965 Mon Sep 17 00:00:00 2001 
From: Heiko Mueller Date: Mon, 30 Nov 2020 10:04:31 -0500 Subject: [PATCH 16/25] Add stats tools --- pom.xml | 7 +- .../java/org/opendata/curation/d4/D4.java | 4 +- ...ts.java => ExpandedColumnStatsWriter.java} | 35 +++++--- .../d4/column/ParallelColumnExpander.java | 3 - .../d4/export/EQStatsLoadFileWriter.java | 64 ++++++++++++++ .../db/eq/EQColumnCountHistorgramWriter.java | 84 +++++++++++++++++++ 6 files changed, 178 insertions(+), 19 deletions(-) rename src/main/java/org/opendata/curation/d4/column/{ExpandedColumnStats.java => ExpandedColumnStatsWriter.java} (75%) create mode 100644 src/main/java/org/opendata/curation/d4/export/EQStatsLoadFileWriter.java create mode 100644 src/main/java/org/opendata/db/eq/EQColumnCountHistorgramWriter.java diff --git a/pom.xml b/pom.xml index 6aea5eb..fa03c48 100644 --- a/pom.xml +++ b/pom.xml @@ -70,12 +70,13 @@ true - org.opendata.curation.d4.D4 - + + + org.opendata.curation.d4.export.EQStatsLoadFileWriter - + diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 4905e24..4c32ee3 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -28,7 +28,7 @@ import java.util.logging.Logger; import org.opendata.curation.d4.column.ExpandedColumnIndex; import org.opendata.curation.d4.column.ExpandedColumnReader; -import org.opendata.curation.d4.column.ExpandedColumnStats; +import org.opendata.curation.d4.column.ExpandedColumnStatsWriter; import org.opendata.curation.d4.column.ExpandedColumnWriterFactory; import org.opendata.curation.d4.column.ParallelColumnExpander; import org.opendata.curation.d4.domain.DomainReader; @@ -98,7 +98,7 @@ public void expandColumns( ); if (verbose) { - ExpandedColumnStats colStats = new ExpandedColumnStats(); + ExpandedColumnStatsWriter colStats = new ExpandedColumnStatsWriter(); new ExpandedColumnReader(outputFile).stream(colStats); colStats.print(); } diff --git 
a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStats.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java similarity index 75% rename from src/main/java/org/opendata/curation/d4/column/ExpandedColumnStats.java rename to src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java index c4105c9..a3bd362 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStats.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java @@ -21,6 +21,7 @@ import java.io.PrintWriter; import java.util.logging.Level; import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; import org.opendata.curation.d4.Constants; import org.opendata.core.util.Avg; @@ -31,7 +32,7 @@ * * @author Heiko Mueller */ -public class ExpandedColumnStats implements ExpandedColumnConsumer { +public class ExpandedColumnStatsWriter implements ExpandedColumnConsumer { private int _columnCount = 0; private int _expandedCount = 0; @@ -96,27 +97,39 @@ public void print() { private static final String COMMAND = "Usage:\n" + - " "; + " \n" + + " "; private static final Logger LOGGER = Logger - .getLogger(ExpandedColumnStats.class.getName()); + .getLogger(ExpandedColumnStatsWriter.class.getName()); public static void main(String[] args) { - System.out.println(Constants.NAME + " - Expanded Column Stats - Version (" + Constants.VERSION + ")\n"); + System.out.println(Constants.NAME + " - Expanded Column Stats Writer - Version (" + Constants.VERSION + ")\n"); - if (args.length != 1) { + if (args.length != 2) { System.out.println(COMMAND); System.exit(-1); } - File columnFile = new File(args[0]); + File columnsFile = new File(args[0]); + File outputFile = new File(args[1]); - ExpandedColumnStats consumer = new ExpandedColumnStats(); - - try (PrintWriter out = new PrintWriter(System.out)) { - new ExpandedColumnReader(columnFile).stream(consumer); - consumer.print(out); + try (PrintWriter out = 
FileSystem.openPrintWriter(outputFile)) { + ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(); + new ExpandedColumnReader(columnsFile).stream(columnIndex); + for (ExpandedColumn column : columnIndex.columns()) { + int colCount = columnIndex.columns(column.id()).length(); + String line = String.format( + "%d\t%d\t%d\t%d", + column.id(), + colCount, + column.originalNodes().length(), + column.expansionSize() + ); + out.println(line); + System.out.println(line); + } } catch (java.io.IOException ex) { LOGGER.log(Level.SEVERE, "RUN", ex); System.exit(-1); diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index f9c7169..c985b94 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -35,11 +35,8 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; import org.opendata.core.set.HashIDSet; -import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IDSet; -import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.core.util.MemUsagePrinter; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; import org.opendata.db.column.Column; diff --git a/src/main/java/org/opendata/curation/d4/export/EQStatsLoadFileWriter.java b/src/main/java/org/opendata/curation/d4/export/EQStatsLoadFileWriter.java new file mode 100644 index 0000000..c2133dc --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/export/EQStatsLoadFileWriter.java @@ -0,0 +1,64 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.export; + +import java.io.File; +import java.io.PrintWriter; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; +import org.opendata.db.eq.EQ; +import org.opendata.db.eq.EQIndex; + +/** + * Write load file containing number of terms and columns for each equivalence + * class. + * + * @author Heiko Mueller + */ +public class EQStatsLoadFileWriter { + + private final static String COMMAND = + "Usage:\n" + + " \n" + + " "; + + private final static Logger LOGGER = Logger + .getLogger(EQStatsLoadFileWriter.class.getName()); + + public static void main(String[] args) { + + if (args.length != 2) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File outFile = new File(args[1]); + + try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { + EQIndex eqIndex = new EQIndex(eqFile); + for (EQ eq : eqIndex) { + out.println(eq.id() + "\t" + eq.columns().length() + "\t" + eq.terms().length()); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/db/eq/EQColumnCountHistorgramWriter.java b/src/main/java/org/opendata/db/eq/EQColumnCountHistorgramWriter.java new file mode 100644 index 0000000..061812f --- /dev/null +++ b/src/main/java/org/opendata/db/eq/EQColumnCountHistorgramWriter.java @@ 
-0,0 +1,84 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.eq; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.FileSystem; + +/** + * Write histogram of column counts for all equivalence classes and terms. 
+ * + * @author Heiko Mueller + */ +public class EQColumnCountHistorgramWriter { + + private final static String COMMAND = + "Usage:\n" + + " \n" + + " "; + + private final static Logger LOGGER = Logger + .getLogger(EQColumnCountHistorgramWriter.class.getName()); + + public static void main(String[] args) { + + if (args.length != 2) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File outFile = new File(args[1]); + + try (PrintWriter out = FileSystem.openPrintWriter(outFile)) { + HashMap histogram = new HashMap<>(); + int maxColCount = 0; + EQIndex eqIndex = new EQIndex(eqFile); + for (EQ eq : eqIndex) { + int colCount = eq.columns().length(); + int termCount = eq.terms().length(); + if (histogram.containsKey(colCount)) { + int[] entry = histogram.get(colCount); + entry[0]++; + entry[1] += termCount; + } else { + histogram.put(colCount, new int[]{1, termCount}); + } + if (colCount > maxColCount) { + maxColCount = colCount; + } + } + for (int iCount = 1; iCount <= maxColCount; iCount++) { + int[] counts; + if (histogram.containsKey(iCount)) { + counts = histogram.get(iCount); + } else { + counts = new int[]{0, 0}; + } + out.println(iCount + "\t" + counts[0] + "\t" + counts[1]); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} From 9efe12f01e6452ffe148eafe2296941eab2b5968 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sun, 6 Dec 2020 12:12:37 -0500 Subject: [PATCH 17/25] Output of data collection source names and consumer target names --- pom.xml | 5 +- .../io/prov/DataCollection.java} | 13 +- .../org/opendata/core/io/prov/DataSink.java | 28 +++ .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 45 ++--- .../curation/d4/column/ColumnExpander.java | 1 - .../d4/column/ExpandedColumnIndex.java | 16 +- .../d4/column/ExpandedColumnStatsWriter.java | 2 +- .../column/ExpandedColumnWriterFactory.java | 62 ------ 
.../d4/column/ParallelColumnExpander.java | 15 +- .../curation/d4/domain/DomainConsumer.java | 4 +- .../d4/domain/DomainSetStatsPrinter.java | 6 + .../curation/d4/domain/DomainWriter.java | 6 + ...a => ExternalMemLocalDomainGenerator.java} | 20 +- ...or.java => InMemLocalDomainGenerator.java} | 20 +- .../curation/d4/domain/StrongDomain.java | 85 ++++++++ .../d4/experiments/BestGTAllMatch.java | 104 ---------- .../d4/experiments/BestGTStrongMatch.java | 191 ++++++++++++++++++ .../d4/export/ExportStrongDomains.java | 34 +--- .../d4/signature/RobustSignatureIndex.java | 73 ------- .../d4/signature/SignatureBlocksBuffer.java | 49 +---- .../d4/signature/SignatureBlocksIndex.java | 12 ++ .../d4/signature/SignatureBlocksReader.java | 16 +- .../d4/signature/SignatureBlocksStream.java | 4 +- .../org/opendata/db/column/ColumnPrinter.java | 70 +++++++ src/main/java/org/opendata/db/eq/EQIndex.java | 16 +- 26 files changed, 530 insertions(+), 369 deletions(-) rename src/main/java/org/opendata/{curation/d4/column/ExpandedColumnConsumerFactory.java => core/io/prov/DataCollection.java} (72%) create mode 100644 src/main/java/org/opendata/core/io/prov/DataSink.java delete mode 100644 src/main/java/org/opendata/curation/d4/column/ExpandedColumnWriterFactory.java rename src/main/java/org/opendata/curation/d4/domain/{SingleScanLocalDomainGenerator.java => ExternalMemLocalDomainGenerator.java} (91%) rename src/main/java/org/opendata/curation/d4/domain/{MultiScanLocalDomainGenerator.java => InMemLocalDomainGenerator.java} (91%) delete mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java create mode 100644 src/main/java/org/opendata/curation/d4/experiments/BestGTStrongMatch.java delete mode 100644 src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java create mode 100644 src/main/java/org/opendata/db/column/ColumnPrinter.java diff --git a/pom.xml b/pom.xml index fa03c48..b1f4f8e 100644 --- a/pom.xml +++ b/pom.xml @@ -70,14 +70,13 @@ true - 
+ org.opendata.curation.d4.D4 - org.opendata.curation.d4.export.EQStatsLoadFileWriter - + diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnConsumerFactory.java b/src/main/java/org/opendata/core/io/prov/DataCollection.java similarity index 72% rename from src/main/java/org/opendata/curation/d4/column/ExpandedColumnConsumerFactory.java rename to src/main/java/org/opendata/core/io/prov/DataCollection.java index 0f0a5e3..8027909 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnConsumerFactory.java +++ b/src/main/java/org/opendata/core/io/prov/DataCollection.java @@ -15,17 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.opendata.curation.d4.column; - -import java.util.HashMap; -import org.opendata.core.set.HashIDSet; +package org.opendata.core.io.prov; /** - * Factory pattern for column consumers. + * Simple interface to manage the information source for different types of + * object collections. These collections may either be read from disk or + * generated in memory. * * @author Heiko Mueller */ -public interface ExpandedColumnConsumerFactory { +public interface DataCollection { - public ExpandedColumnConsumer getConsumer(HashMap groups); + public String source(); } diff --git a/src/main/java/org/opendata/core/io/prov/DataSink.java b/src/main/java/org/opendata/core/io/prov/DataSink.java new file mode 100644 index 0000000..2c61762 --- /dev/null +++ b/src/main/java/org/opendata/core/io/prov/DataSink.java @@ -0,0 +1,28 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.core.io.prov; + +/** + * Simple interface to manage the data sink name for data generators. + * + * @author Heiko Mueller + */ +public interface DataSink { + + public String target(); +} diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index d7b0bc2..b8dbef1 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev10"; + public static final String VERSION = "0.29.0.dev12"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 4c32ee3..e6c6151 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -29,12 +29,11 @@ import org.opendata.curation.d4.column.ExpandedColumnIndex; import org.opendata.curation.d4.column.ExpandedColumnReader; import org.opendata.curation.d4.column.ExpandedColumnStatsWriter; -import org.opendata.curation.d4.column.ExpandedColumnWriterFactory; import org.opendata.curation.d4.column.ParallelColumnExpander; import org.opendata.curation.d4.domain.DomainReader; import org.opendata.curation.d4.domain.DomainSetStatsPrinter; import org.opendata.curation.d4.domain.DomainWriter; -import org.opendata.curation.d4.domain.SingleScanLocalDomainGenerator; 
+import org.opendata.curation.d4.domain.ExternalMemLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomainGenerator; import org.opendata.curation.d4.signature.SignatureBlocksGenerator; import org.opendata.curation.d4.signature.SignatureBlocksStats; @@ -43,7 +42,7 @@ import org.opendata.core.io.FileSystem; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.curation.d4.domain.Domain; -import org.opendata.curation.d4.domain.MultiScanLocalDomainGenerator; +import org.opendata.curation.d4.domain.InMemLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomain; import org.opendata.curation.d4.domain.StrongDomainReader; import org.opendata.curation.d4.domain.StrongDomainWriter; @@ -94,7 +93,7 @@ public void expandColumns( numberOfIterations, threads, verbose, - new ExpandedColumnWriterFactory(outputFile, false) + outputFile ); if (verbose) { @@ -130,16 +129,27 @@ public void localDomains( String trimmer, boolean originalOnly, int threads, - boolean singleScan, + boolean inMem, boolean verbose, TelemetryCollector telemetry, File outputFile ) throws java.io.IOException { - ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(); + ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(columnsFile); new ExpandedColumnReader(columnsFile).stream(columnIndex); - if (singleScan) { - new SingleScanLocalDomainGenerator(telemetry).run( + if (inMem) { + new InMemLocalDomainGenerator(telemetry).run( + nodeIndex, + columnIndex, + signatures.read(), + trimmer, + originalOnly, + threads, + verbose, + new DomainWriter(outputFile) + ); + } else { + new ExternalMemLocalDomainGenerator(telemetry).run( nodeIndex, columnIndex, signatures, @@ -149,19 +159,8 @@ public void localDomains( verbose, new DomainWriter(outputFile) ); - } else { - new MultiScanLocalDomainGenerator(telemetry).run( - nodeIndex, - columnIndex, - signatures.read(), - trimmer, - originalOnly, - threads, - verbose, - new DomainWriter(outputFile) - ); } - + if 
(verbose) { DomainSetStatsPrinter localStats = new DomainSetStatsPrinter(); new DomainReader(outputFile).stream(localStats); @@ -485,7 +484,7 @@ public static void main(String[] args) { ), new Parameter("originalonly", " [default: false]"), new Parameter("threads", " [default: 6]"), - new Parameter("singlescan", " [default: false]"), + new Parameter("inmem", " [default: false]"), new Parameter("verbose", " [default: true]"), new Parameter( "localdomains", @@ -500,7 +499,7 @@ public static void main(String[] args) { String trimmer = params.getAsString("trimmer", SignatureTrimmer.CENTRIST); boolean originalOnly = params.getAsBool("originalonly", false); int threads = params.getAsInt("threads", 6); - boolean singleScan = params.getAsBool("singlescan", false); + boolean inMem = params.getAsBool("inmem", false); boolean verbose = params.getAsBool("verbose", true); File localDomainFile = params.getAsFile("localdomains", "local-domains.txt.gz"); try { @@ -511,7 +510,7 @@ public static void main(String[] args) { trimmer, originalOnly, threads, - singleScan, + inMem, verbose, new TelemetryPrinter(), localDomainFile diff --git a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java index b0e806e..aade395 100644 --- a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java @@ -26,7 +26,6 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; -import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.db.eq.EQIndex; /** diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java index a6b8da3..2aa4149 100644 --- 
a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnIndex.java @@ -17,9 +17,11 @@ */ package org.opendata.curation.d4.column; +import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import org.opendata.core.io.prov.DataCollection; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IDSet; @@ -35,11 +37,17 @@ * * @author Heiko Mueller */ -public class ExpandedColumnIndex implements ExpandedColumnConsumer { +public class ExpandedColumnIndex implements DataCollection, ExpandedColumnConsumer { private HashMap _columnIndex; private List _columnList = null; private HashMap _columnMapping; + private final String _source; + + public ExpandedColumnIndex(File file) { + + _source = file.getName(); + } @Override public void close() { @@ -105,4 +113,10 @@ public IdentifiableObjectSet toColumns(boolean originalOnly) return result; } + + @Override + public String source() { + + return _source; + } } diff --git a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java index a3bd362..707c02b 100644 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java +++ b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnStatsWriter.java @@ -116,7 +116,7 @@ public static void main(String[] args) { File outputFile = new File(args[1]); try (PrintWriter out = FileSystem.openPrintWriter(outputFile)) { - ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(); + ExpandedColumnIndex columnIndex = new ExpandedColumnIndex(outputFile); new ExpandedColumnReader(columnsFile).stream(columnIndex); for (ExpandedColumn column : columnIndex.columns()) { int colCount = columnIndex.columns(column.id()).length(); diff --git 
a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnWriterFactory.java b/src/main/java/org/opendata/curation/d4/column/ExpandedColumnWriterFactory.java deleted file mode 100644 index d305b1b..0000000 --- a/src/main/java/org/opendata/curation/d4/column/ExpandedColumnWriterFactory.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.column; - -import java.io.File; -import java.util.HashMap; -import org.opendata.core.io.FileSystem; -import org.opendata.core.set.HashIDSet; - -/** - * - * @author Heiko Mueller - */ -public class ExpandedColumnWriterFactory implements ExpandedColumnConsumerFactory { - - private int _count = 0; - private final File _file; - private ExpandedColumnWriter _globalWriter = null; - private final boolean _outputToDir; - - public ExpandedColumnWriterFactory(File file, boolean outputToDir) { - - _file = file; - _outputToDir = outputToDir; - - if (outputToDir) { - FileSystem.createFolder(file); - } else { - FileSystem.createParentFolder(file); - } - } - - @Override - public synchronized ExpandedColumnConsumer getConsumer(HashMap groups) { - - if (_outputToDir) { - String filename = "expanded-columns." 
+ (_count++) + ".txt.gz"; - File outputFile = FileSystem.joinPath(_file, filename); - return new ExpandedColumnWriter(outputFile, groups); - } else { - if (_globalWriter == null) { - _globalWriter = new ExpandedColumnWriter(_file, groups); - } - return _globalWriter; - } - } -} diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index c985b94..f6736a9 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -17,6 +17,7 @@ */ package org.opendata.curation.d4.column; +import java.io.File; import java.math.BigDecimal; import java.util.ArrayList; import java.util.Collections; @@ -197,7 +198,7 @@ public void run( int numberOfIterations, int threads, boolean verbose, - ExpandedColumnConsumerFactory consumerFactory + File outputFile ) { HashMap columnIndex = new HashMap<>(); HashMap groups = new HashMap<>(); @@ -233,18 +234,24 @@ public void run( System.out.println( String.format( "EXPAND %d COLUMNS IN %d GROUPS USING:\n" + + " --eqs=%s\n" + + " --signatures=%s\n" + " --trimmer=%s\n" + " --expandThreshold=%s\n" + " --decrease=%s\n" + " --iterations=%d\n" + - " --threads=%d", + " --threads=%d\n" + + " --columns=%s", db.length(), columnList.size(), + nodes.source(), + signatures.source(), trimmer, threshold.toPlainString(), decreaseFactor.toPlainString(), numberOfIterations, - threads + threads, + outputFile.getName() ) ); LOGGER.log(Level.INFO, String.format("START @ %s", start)); @@ -270,7 +277,7 @@ public void run( decreaseFactor, numberOfIterations, verbose, - consumerFactory.getConsumer(groups) + new ExpandedColumnWriter(outputFile, groups) ); es.execute(expander); } diff --git a/src/main/java/org/opendata/curation/d4/domain/DomainConsumer.java b/src/main/java/org/opendata/curation/d4/domain/DomainConsumer.java index fca64ac..6b9156f 100644 --- 
a/src/main/java/org/opendata/curation/d4/domain/DomainConsumer.java +++ b/src/main/java/org/opendata/curation/d4/domain/DomainConsumer.java @@ -17,12 +17,14 @@ */ package org.opendata.curation.d4.domain; +import org.opendata.core.io.prov.DataSink; + /** * Consumer interface for domain streams. * * @author Heiko Mueller */ -public interface DomainConsumer { +public interface DomainConsumer extends DataSink { public void close(); public void consume(Domain domain); diff --git a/src/main/java/org/opendata/curation/d4/domain/DomainSetStatsPrinter.java b/src/main/java/org/opendata/curation/d4/domain/DomainSetStatsPrinter.java index 26cc580..45153f9 100644 --- a/src/main/java/org/opendata/curation/d4/domain/DomainSetStatsPrinter.java +++ b/src/main/java/org/opendata/curation/d4/domain/DomainSetStatsPrinter.java @@ -85,6 +85,12 @@ public void print() { System.out.println("AVG. COLUMNS PER DOMAIN: " + new FormatedBigDecimal((double)_domainColumnCount/(double)_domainCount)); } } + + @Override + public String target() { + + return "System.out"; + } private static final String COMMAND = "Usage:\n" + diff --git a/src/main/java/org/opendata/curation/d4/domain/DomainWriter.java b/src/main/java/org/opendata/curation/d4/domain/DomainWriter.java index 74e49a8..cc35e5b 100644 --- a/src/main/java/org/opendata/curation/d4/domain/DomainWriter.java +++ b/src/main/java/org/opendata/curation/d4/domain/DomainWriter.java @@ -66,4 +66,10 @@ public void open() { throw new RuntimeException(ex); } } + + @Override + public String target() { + + return _file.getName(); + } } diff --git a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java similarity index 91% rename from src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java rename to src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java index 1a2819f..f4fcea2 100644 --- 
a/src/main/java/org/opendata/curation/d4/domain/SingleScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java @@ -48,7 +48,7 @@ * * @author Heiko Mueller */ -public class SingleScanLocalDomainGenerator { +public class ExternalMemLocalDomainGenerator { public static final String TELEMETRY_ID = "LOCAL DOMAINS"; @@ -120,12 +120,12 @@ public void run() { private final TelemetryCollector _telemetry; - public SingleScanLocalDomainGenerator(TelemetryCollector telemetry) { + public ExternalMemLocalDomainGenerator(TelemetryCollector telemetry) { _telemetry = telemetry; } - public SingleScanLocalDomainGenerator() { + public ExternalMemLocalDomainGenerator() { this(new TelemetryPrinter()); } @@ -155,15 +155,23 @@ public void run( if (verbose) { System.out.println( String.format( - "LOCAL DOMAINS FOR %d COLUMN GROUPS USING:\n" + + "LOCAL DOMAINS (EXTERNAL MEMORY) FOR %d COLUMN GROUPS USING:\n" + + " --eqs=%s\n" + + " --columns=%s\n" + + " --signatures=%s\n" + " --trimmer=%s\n" + " --originalonly=%s\n" + " --threads=%d\n" + - " --singlescan=false", + " --inmem=false\n" + + " --localdomains=%s", columnList.size(), + nodes.source(), + columnIndex.source(), + signatures.source(), trimmer, Boolean.toString(originalOnly), - threads + threads, + consumer.target() ) ); System.out.println("START @ " + start); diff --git a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java similarity index 91% rename from src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java rename to src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java index b8e02d0..68e113d 100644 --- a/src/main/java/org/opendata/curation/d4/domain/MultiScanLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java @@ -48,7 +48,7 @@ * * @author Heiko Mueller */ 
-public class MultiScanLocalDomainGenerator { +public class InMemLocalDomainGenerator { public static final String TELEMETRY_ID = "LOCAL DOMAINS"; @@ -119,12 +119,12 @@ public void run() { private final TelemetryCollector _telemetry; - public MultiScanLocalDomainGenerator(TelemetryCollector telemetry) { + public InMemLocalDomainGenerator(TelemetryCollector telemetry) { _telemetry = telemetry; } - public MultiScanLocalDomainGenerator() { + public InMemLocalDomainGenerator() { this(new TelemetryPrinter()); } @@ -166,15 +166,23 @@ public int compare(ExpandedColumn col1, ExpandedColumn col2) { if (verbose) { System.out.println( String.format( - "LOCAL DOMAINS FOR %d COLUMN GROUPS USING:\n" + + "LOCAL DOMAINS (IN MEMORY) FOR %d COLUMN GROUPS USING:\n" + + " --eqs=%s\n" + + " --columns=%s\n" + + " --signatures=%s\n" + " --trimmer=%s\n" + " --originalonly=%s\n" + " --threads=%d\n" + - " --singlescan=false", + " --inmem=true\n" + + " --localdomains=%s", columnList.size(), + nodes.source(), + columnIndex.source(), + signatures.source(), trimmer, Boolean.toString(originalOnly), - threads + threads, + consumer.target() ) ); System.out.println(String.format("START @ %s", start)); diff --git a/src/main/java/org/opendata/curation/d4/domain/StrongDomain.java b/src/main/java/org/opendata/curation/d4/domain/StrongDomain.java index 400ac9e..5ffec2f 100644 --- a/src/main/java/org/opendata/curation/d4/domain/StrongDomain.java +++ b/src/main/java/org/opendata/curation/d4/domain/StrongDomain.java @@ -17,9 +17,17 @@ */ package org.opendata.curation.d4.domain; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.opendata.core.object.IdentifiableDouble; import org.opendata.core.object.IdentifiableObjectImpl; +import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.core.sort.DoubleValueDescSort; +import 
org.opendata.db.eq.EQIndex; /** * A strong domain is a set of local domains. Each member node in the strong @@ -59,6 +67,83 @@ public IDSet columns() { return _columns; } + /** + * Get blocks of term identifier for strong domain members. Each term is + * assigned a weight based on the number of local domains it occurs in and + * blocks are generated using steepest drop. + * + * @param eqIndex + * @return + */ + public List> getBlocksWithWeights(EQIndex eqIndex) { + + List nodes = new ArrayList<>(); + for (StrongDomainMember node : this.members()) { + double weight = node.weight().doubleValue(); + nodes.add(new IdentifiableDouble(node.id(), weight)); + Collections.sort(nodes, new DoubleValueDescSort<>()); + } + + MaxDropFinder dropFinder = new MaxDropFinder<>(0.0, true, true); + + ArrayList> blocks = new ArrayList<>(); + + int start = 0; + final int end = nodes.size(); + while (start < end) { + int pruneIndex = dropFinder.getPruneIndex(nodes, start); + if (pruneIndex <= start) { + break; + } + List block = new ArrayList<>(); + for (int iEl = start; iEl < pruneIndex; iEl++) { + IdentifiableDouble node = nodes.get(iEl); + for (int termId : eqIndex.get(node.id()).terms()) { + block.add(new IdentifiableDouble(termId, node.value())); + } + } + blocks.add(block); + start = pruneIndex; + } + return blocks; + } + + /** + * Get blocks of node identifier for strong domain members. 
+ * + * @param eqIndex + * @return + */ + public List getNodeBlocks(EQIndex eqIndex) { + + List nodes = new ArrayList<>(); + for (StrongDomainMember node : this.members()) { + double weight = node.weight().doubleValue(); + nodes.add(new IdentifiableDouble(node.id(), weight)); + Collections.sort(nodes, new DoubleValueDescSort<>()); + } + + MaxDropFinder dropFinder = new MaxDropFinder<>(0.0, true, true); + + ArrayList blocks = new ArrayList<>(); + + int start = 0; + final int end = nodes.size(); + while (start < end) { + int pruneIndex = dropFinder.getPruneIndex(nodes, start); + if (pruneIndex <= start) { + break; + } + HashIDSet block = new HashIDSet(); + for (int iEl = start; iEl < pruneIndex; iEl++) { + block.add(nodes.get(iEl).id()); + } + blocks.add(block); + start = pruneIndex; + } + return blocks; + } + /** * Set of identifier for local domains that compose this strong domain. * diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java deleted file mode 100644 index 20a2b05..0000000 --- a/src/main/java/org/opendata/curation/d4/experiments/BestGTAllMatch.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.opendata.curation.d4.experiments; - -import java.io.File; -import java.math.BigDecimal; -import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.opendata.core.metric.F1; -import org.opendata.core.metric.Precision; -import org.opendata.core.metric.Recall; -import org.opendata.core.set.HashIDSet; -import org.opendata.core.set.IDSet; -import org.opendata.core.util.FormatedBigDecimal; - -/** - * Find best match of discovered strong domains with ground-truth domains over - * all strong domain block sub-sequences. - * - * @author Heiko Mueller - */ -public class BestGTAllMatch { - - private static final String COMMAND = - "Usage:\n" + - " \n" + - " "; - - private static final Logger LOGGER = Logger.getLogger(BestGTAllMatch.class.getName()); - - public static void main(String[] args) { - - if (args.length != 2) { - System.out.println(COMMAND); - System.exit(-1); - } - - File gtDir = new File(args[0]); - File domainDir = new File(args[1]); - - System.out.println("DOMAIN\tID\tPRECISION\tRECALL\tF1"); - - try { - List> domains; - domains = new StrongDomainJsonReader().readAllBlocks(domainDir); - for (File file : gtDir.listFiles()) { - String name = file.getName().substring(0, file.getName().indexOf(".")); - IDSet gt = new GTReader().read(file); - BigDecimal[] bestMatch = new BigDecimal[]{ - BigDecimal.ZERO, - BigDecimal.ZERO, - BigDecimal.ZERO - }; - for (List domainBlocks : domains) { - HashIDSet domain = new HashIDSet(); - for (IDSet block : domainBlocks) { - domain.add(block); - int ovp = domain.overlap(gt); - if (ovp > 0) { - Precision precision = new Precision(ovp, domain.length()); - Recall recall = new Recall(ovp, gt.length()); - BigDecimal f1 = new F1(precision, recall).value(); - if (bestMatch[2].compareTo(f1) < 0) { - bestMatch = new BigDecimal[]{ - precision.value(), - recall.value(), - f1 - }; - } - } - } - } - System.out.println( - String.format( - "%s\t%s\t%s\t%s", - name, - new 
FormatedBigDecimal(bestMatch[0]), - new FormatedBigDecimal(bestMatch[1]), - new FormatedBigDecimal(bestMatch[2]) - ) - ); - } - } catch (java.io.IOException ex) { - LOGGER.log(Level.SEVERE, "RUN", ex); - System.exit(-1); - } - } -} diff --git a/src/main/java/org/opendata/curation/d4/experiments/BestGTStrongMatch.java b/src/main/java/org/opendata/curation/d4/experiments/BestGTStrongMatch.java new file mode 100644 index 0000000..96b508b --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/experiments/BestGTStrongMatch.java @@ -0,0 +1,191 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.experiments; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.metric.F1; +import org.opendata.core.metric.Precision; +import org.opendata.core.metric.Recall; +import org.opendata.core.set.HashIDSet; +import org.opendata.core.set.IDSet; +import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.curation.d4.domain.StrongDomain; +import org.opendata.curation.d4.domain.StrongDomainReader; +import org.opendata.db.eq.EQIndex; + +/** + * Find best match of discovered strong domains with ground-truth domains over + * all strong domain block sub-sequences. + * + * @author Heiko Mueller + */ +public class BestGTStrongMatch { + + private void match( + int domainId, + IDSet terms, + IDSet gt, + BestMatch[] matches, + List context + ) { + + int ovp = terms.overlap(gt); + if (ovp > 0) { + Precision precision = new Precision(ovp, terms.length()); + Recall recall = new Recall(ovp, gt.length()); + F1 f1 = new F1(precision, recall); + for (int iMatch : context) { + if (matches[iMatch].f1().compareTo(f1) < 0) { + matches[iMatch] = new BestMatch(domainId, precision, recall); + } + } + } + } + + public void run( + EQIndex eqIndex, + IdentifiableObjectSet strongDomains, + HashMap groundTruths + ) { + HashMap bestMatches = new HashMap<>(); + for (String name : groundTruths.keySet()) { + bestMatches.put( + name, + new BestMatch[]{new BestMatch(), new BestMatch(), new BestMatch()} + ); + } + + for (StrongDomain domain : strongDomains) { + List blocks; + blocks = domain.getNodeBlocks(eqIndex); + HashIDSet terms = new HashIDSet(); + for (int iBlock = 0; iBlock < blocks.size(); iBlock++) { + List context = new ArrayList<>(); + if (iBlock == 0) { + context.add(0); + } + context.add(1); + if (iBlock == blocks.size() - 1) { + context.add(2); + } + IDSet block = 
blocks.get(iBlock); + if ((terms.length() + block.length()) > 10000) { + break; + } + int blockSize = 0; + for (int nodeId : block) { + blockSize += eqIndex.get(nodeId).terms().length(); + } + if ((terms.length() + blockSize) > 10000) { + break; + } + for (int nodeId : block) { + terms.add(eqIndex.get(nodeId).terms()); + } + for (String name : groundTruths.keySet()) { + IDSet gt = groundTruths.get(name); + this.match(domain.id(), terms, gt, bestMatches.get(name), context); + } + } + } + + System.out.print("DOMAIN"); + for (String key : new String[]{"1", "?", "n"}) { + System.out.print( + String.format( + "\tID (%s)\tPRECISION (%s)\tRECALL (%s)\tF1 (%s)", + key, + key, + key, + key + ) + ); + } + System.out.println(); + + List names = new ArrayList<>(bestMatches.keySet()); + Collections.sort(names); + + for (String name : names) { + System.out.print(name); + for (BestMatch match : bestMatches.get(name)) { + System.out.print( + String.format( + "\t%d\t%s\t%s\t%s", + match.domainId(), + match.precision().toString(), + match.recall().toString(), + match.f1().toString() + ) + ); + } + System.out.println(); + } + } + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger + .getLogger(BestGTStrongMatch.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File gtDir = new File(args[1]); + File strongDomainFile = new File(args[2]); + + // Read ground truth domains. 
+ HashMap groundTruths = new HashMap<>(); + try { + for (File file : gtDir.listFiles()) { + String name = file.getName().substring(0, file.getName().indexOf(".")); + groundTruths.put(name, new GTReader().read(file)); + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "READ GT", ex); + System.exit(-1); + } + + try { + new BestGTStrongMatch() + .run( + new EQIndex(eqFile), + new StrongDomainReader(strongDomainFile).read(), + groundTruths + ); + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } +} diff --git a/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java b/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java index e24296c..8e1f0de 100644 --- a/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java +++ b/src/main/java/org/opendata/curation/d4/export/ExportStrongDomains.java @@ -33,10 +33,8 @@ import java.util.logging.Logger; import org.opendata.core.io.FileSystem; import org.opendata.core.object.IdentifiableDouble; -import org.opendata.core.prune.MaxDropFinder; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.core.sort.DoubleValueDescSort; import org.opendata.core.util.StringHelper; import org.opendata.core.util.count.Counter; import org.opendata.curation.d4.Constants; @@ -60,28 +58,6 @@ */ public class ExportStrongDomains { - private List> getBlocks(List items) { - - MaxDropFinder dropFinder = new MaxDropFinder<>(0.0, true, true); - - int start = 0; - final int end = items.size(); - ArrayList> blocks = new ArrayList<>(); - while (start < end) { - int pruneIndex = dropFinder.getPruneIndex(items, start); - if (pruneIndex <= start) { - break; - } - List block = new ArrayList<>(); - for (int iEl = start; iEl < pruneIndex; iEl++) { - block.add(items.get(iEl)); - } - blocks.add(block); - start = pruneIndex; - } - return blocks; - } - private String getDomainName(List names) { HashMap 
tokens = new HashMap<>(); @@ -203,15 +179,7 @@ public void run( names.add(columnInfo[0]); } String domainName = this.getDomainName(names); - List items = new ArrayList<>(); - for (StrongDomainMember node : domain.members()) { - double weight = node.weight().doubleValue(); - for (int termId : eqIndex.get(node.id()).terms()) { - items.add(new IdentifiableDouble(termId, weight)); - } - Collections.sort(items, new DoubleValueDescSort<>()); - } - List> blocks = this.getBlocks(items); + List> blocks = domain.getBlocksWithWeights(eqIndex); JsonArray arrTerms = new JsonArray(); for (List block : blocks) { JsonArray arrBlock = new JsonArray(); diff --git a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java deleted file mode 100644 index f73799b..0000000 --- a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.signature; - -import org.opendata.core.set.HashObjectSet; - -/** - * Maintain an index of robust signatures in main memory. 
- * - * @author Heiko Mueller - */ -public class RobustSignatureIndex implements SignatureBlocksConsumer, SignatureBlocksStream { - - private final HashObjectSet _elements = new HashObjectSet<>(); - - @Override - public void close() { - - } - - public boolean contains(int id) { - - return _elements.contains(id); - } - - @Override - public synchronized void consume(SignatureBlocks sig) { - - _elements.add(sig); - } - - public SignatureBlocks get(int id) { - - return _elements.get(id); - } - - @Override - public boolean isDone() { - - return false; - } - - @Override - public void open() { - - } - - @Override - public void stream(SignatureBlocksConsumer consumer) { - - consumer.open(); - - for (SignatureBlocks sig : _elements) { - consumer.consume(sig); - } - consumer.close(); - } -} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java index 8d25bbb..7319ce4 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java @@ -17,14 +17,9 @@ */ package org.opendata.curation.d4.signature; -import java.io.File; import java.util.ArrayList; -import java.util.Date; import java.util.Iterator; import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; -import org.opendata.core.util.MemUsagePrinter; /** * Memory buffer for signature blocks. 
@@ -34,6 +29,12 @@ public class SignatureBlocksBuffer implements Iterable, SignatureBlocksConsumer, SignatureBlocksStream { private final List _signatures = new ArrayList<>(); + private final String _source; + + public SignatureBlocksBuffer(String source) { + + _source = source; + } @Override public void close() { @@ -85,38 +86,10 @@ public void stream(SignatureBlocksConsumer consumer) { consumer.close(); } - - private static final String COMMAND = - "Usage:\n" + - " "; - - private static final Logger LOGGER = Logger - .getLogger(SignatureBlocksBuffer.class.getName()); - - public static void main(String[] args) { - - if (args.length != 1) { - System.out.println(COMMAND); - System.exit(-1); - } - - File signaturesFileOrDir = new File(args[0]); - - SignatureBlocksBuffer buffer = new SignatureBlocksBuffer(); - - Date start = new Date(); - try { - new SignatureBlocksReader(signaturesFileOrDir, true).stream(buffer); - } catch (java.io.IOException ex) { - LOGGER.log(Level.SEVERE, "RUN", ex); - System.exit(-1); - } - Date end = new Date(); - - long execTime = end.getTime() - start.getTime(); - - new MemUsagePrinter().print(); - - System.out.println("\n\nREAD " + buffer.size() + " SIGNATURES IN " + execTime + " ms"); + + @Override + public String source() { + + return _source; } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java index 3568a02..dd9dc58 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java @@ -30,6 +30,12 @@ public class SignatureBlocksIndex implements Iterable, SignatureBlocksConsumer, SignatureBlocksStream { private final HashObjectSet _signatures = new HashObjectSet<>(); + private final String _source; + + public SignatureBlocksIndex(String source) { + + _source = source; + } public void clear() { @@ -101,6 +107,12 @@ public 
void stream(SignatureBlocksConsumer consumer, IDSet filter) { consumer.close(); } + + @Override + public String source() { + + return _source; + } public List toList() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java index b792996..e06ca09 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java @@ -33,9 +33,13 @@ */ public class SignatureBlocksReader extends FileSetReader implements SignatureBlocksStream { + private final File _file; + public SignatureBlocksReader(File file, boolean verbose) { super(file, verbose); + + _file = file; } public SignatureBlocksReader(File file) { @@ -43,9 +47,11 @@ public SignatureBlocksReader(File file) { this(file, false); } - public SignatureBlocksReader(List files) { + public SignatureBlocksReader(List files, File directory) { super(files, false); + + _file = directory; } public static int[] getBlockNodes(String text) { @@ -66,7 +72,7 @@ public static int[] getBlockNodes(String text) { public SignatureBlocksIndex read() throws java.io.IOException { - SignatureBlocksIndex buffer = new SignatureBlocksIndex(); + SignatureBlocksIndex buffer = new SignatureBlocksIndex(this.source()); this.stream(buffer); return buffer; } @@ -97,4 +103,10 @@ public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException consumer.close(); } + + @Override + public String source() { + + return _file.getName(); + } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java index fff4650..f55c09a 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java @@ -17,12 +17,14 @@ */ package 
org.opendata.curation.d4.signature; +import org.opendata.core.io.prov.DataCollection; + /** * Stream of signature blocks. * * @author Heiko Mueller */ -public interface SignatureBlocksStream { +public interface SignatureBlocksStream extends DataCollection { public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException; } diff --git a/src/main/java/org/opendata/db/column/ColumnPrinter.java b/src/main/java/org/opendata/db/column/ColumnPrinter.java new file mode 100644 index 0000000..7c74403 --- /dev/null +++ b/src/main/java/org/opendata/db/column/ColumnPrinter.java @@ -0,0 +1,70 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.db.column; + +import java.io.File; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.opendata.core.io.EntitySetReader; +import org.opendata.core.set.EntitySet; +import org.opendata.core.set.IDSet; +import org.opendata.db.eq.EQIndex; + +/** + * Print nodes and terms in a database column. 
+ * + * @author Heiko Mueller + */ +public class ColumnPrinter { + + private final static String COMMAND = + "Usage:\n" + + " \n" + + " \n" + + " "; + + private final static Logger LOGGER = Logger + .getLogger(ColumnPrinter.class.getName()); + + public static void main(String[] args) { + + if (args.length != 3) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + File termFile = new File(args[1]); + int columnId = Integer.parseInt(args[2]); + + try { + EQIndex eqIndex = new EQIndex(eqFile); + IDSet column = eqIndex.columns().get(columnId); + EntitySet terms = new EntitySetReader(termFile).readEntities(eqIndex, column); + for (int nodeId : column) { + int[] termIds = eqIndex.get(nodeId).terms().toArray(); + System.out.println(nodeId + "\t" + terms.get(termIds[0]).name()); + for (int i = 1; i < termIds.length; i++) { + System.out.println("\t" + terms.get(termIds[i]).name()); + } + } + } catch (java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + } + } +} diff --git a/src/main/java/org/opendata/db/eq/EQIndex.java b/src/main/java/org/opendata/db/eq/EQIndex.java index 7deebd7..511f929 100644 --- a/src/main/java/org/opendata/db/eq/EQIndex.java +++ b/src/main/java/org/opendata/db/eq/EQIndex.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.PrintWriter; import org.opendata.core.io.FileSystem; +import org.opendata.core.io.prov.DataCollection; import org.opendata.core.prune.SizeFunction; import org.opendata.core.set.HashObjectSet; import org.opendata.core.set.IdentifiableIDSet; @@ -34,18 +35,23 @@ * * @author Heiko Mueller */ -public class EQIndex extends HashObjectSet implements EQStream, SizeFunction { +public class EQIndex extends HashObjectSet implements DataCollection, EQStream, SizeFunction { + private final File _file; private int[] _nodeSizes = null; public EQIndex(File eqFile, EQFactory factory) throws java.io.IOException { super(new EQReader(eqFile, factory).read()); + + _file = eqFile; } public 
EQIndex(File eqFile) throws java.io.IOException { super(new EQReader(eqFile).read()); + + _file = eqFile; } public int[] columnSizes() { @@ -72,7 +78,7 @@ public IdentifiableObjectSet columns() { } return columns; } - + @Override public int getSize(int id) { @@ -98,6 +104,12 @@ public int[] nodeSizes() { } return _nodeSizes; } + + @Override + public String source() { + + return _file.getName(); + } /** * Distribute the identifier of all columns in the database across a given From 4dcdbf40aa358add5a65fdc28daf8d90d71f29b0 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sun, 6 Dec 2020 12:26:41 -0500 Subject: [PATCH 18/25] Remove dupliated output --- .../d4/domain/ExternalMemLocalDomainGenerator.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java index f4fcea2..f0ae266 100644 --- a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java @@ -179,26 +179,12 @@ public void run( ExecutorService es = Executors.newCachedThreadPool(); - if (verbose) { - System.out.println( - String.format( - "LOCAL DOMAINS FOR %d COLUMN GROUPS USING:\n" + - " --trimmer=%s\n" + - " --threads=%d", - columnList.size(), - trimmer, - threads - ) - ); - } - SignatureTrimmerFactory trimmerFactory = new SignatureTrimmerFactory( nodes, columnIndex.toColumns(originalOnly), trimmer ); - for (int iThread = 0; iThread < threads; iThread++) { List columns = new ArrayList<>(); for (int iCol = iThread; iCol < columnList.size(); iCol += threads) { From 33ba76529715bc73702e5762418510210b14bbb8 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Sun, 6 Dec 2020 12:48:36 -0500 Subject: [PATCH 19/25] Add min support as parameter for strong domain generation --- .../org/opendata/curation/d4/Constants.java | 2 +- 
.../java/org/opendata/curation/d4/D4.java | 14 +++++---- .../curation/d4/domain/DomainReader.java | 10 ++++++- .../d4/domain/StrongDomainConsumer.java | 3 +- .../d4/domain/StrongDomainGenerator.java | 29 ++++++++++++++----- .../d4/domain/StrongDomainWriter.java | 6 ++++ 6 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index b8dbef1..8cf3c11 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev12"; + public static final String VERSION = "0.29.0.dev13"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index e6c6151..af6df1c 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -202,6 +202,7 @@ public void strongDomains( EQIndex nodeIndex, File localDomainFile, Threshold domainOverlapConstraint, + Threshold minSupportConstraint, BigDecimal supportFraction, int threads, boolean verbose, @@ -209,17 +210,15 @@ public void strongDomains( File outputFile ) throws java.lang.InterruptedException, java.io.IOException { - IdentifiableObjectSet localDomains; - localDomains = new DomainReader(localDomainFile).read(); new StrongDomainGenerator(telemetry).run( nodeIndex, - localDomains, + new DomainReader(localDomainFile), domainOverlapConstraint, - Threshold.getConstraint("GT0.1"), + minSupportConstraint, supportFraction, verbose, threads, - new StrongDomainWriter(outputFile, localDomains) + outputFile ); if (verbose) { @@ -534,6 +533,7 @@ public static void main(String[] args) { " [default: 'local-domains.txt.gz']" ), new Parameter("domainOverlap", " [default: 
'GT0.5']"), + new Parameter("minSupport", " [default: 'GT0.1']"), new Parameter("supportFraction", " [default: 0.25]"), new Parameter("threads", " [default: 6]"), new Parameter("verbose", " [default: true]"), @@ -546,7 +546,8 @@ public static void main(String[] args) { ); File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz"); File localDomainFile = params.getAsFile("localdomains", "local-domains.txt.gz"); - Threshold domainOverlapConstraint = params.getAsConstraint(UNKNOWN, "GT0.5"); + Threshold domainOverlapConstraint = params.getAsConstraint("domainOverlap", "GT0.5"); + Threshold minSupportConstraint = params.getAsConstraint("minSupport", "GT0.1"); BigDecimal supportFraction = params .getAsBigDecimal("supportFraction", new BigDecimal("0.25")); int threads = params.getAsInt("threads", 6); @@ -557,6 +558,7 @@ public static void main(String[] args) { new EQIndex(eqFile), localDomainFile, domainOverlapConstraint, + minSupportConstraint, supportFraction, threads, verbose, diff --git a/src/main/java/org/opendata/curation/d4/domain/DomainReader.java b/src/main/java/org/opendata/curation/d4/domain/DomainReader.java index 9622ad9..73ace0a 100644 --- a/src/main/java/org/opendata/curation/d4/domain/DomainReader.java +++ b/src/main/java/org/opendata/curation/d4/domain/DomainReader.java @@ -20,6 +20,7 @@ import java.io.BufferedReader; import java.io.File; import org.opendata.core.io.FileSystem; +import org.opendata.core.io.prov.DataCollection; import org.opendata.core.object.filter.AnyObjectFilter; import org.opendata.core.object.filter.ObjectFilter; import org.opendata.core.set.HashIDSet; @@ -33,7 +34,7 @@ * * @author Heiko Mueller */ -public class DomainReader implements DomainStream { +public class DomainReader implements DataCollection, DomainStream { private final File _file; @@ -88,6 +89,13 @@ public IdentifiableObjectSet read(ObjectFilter filter) throws j return result; } + + @Override + public String source() { + + return _file.getName(); + } + 
@Override public void stream(DomainConsumer consumer) throws java.io.IOException { diff --git a/src/main/java/org/opendata/curation/d4/domain/StrongDomainConsumer.java b/src/main/java/org/opendata/curation/d4/domain/StrongDomainConsumer.java index 771be9b..af0f256 100644 --- a/src/main/java/org/opendata/curation/d4/domain/StrongDomainConsumer.java +++ b/src/main/java/org/opendata/curation/d4/domain/StrongDomainConsumer.java @@ -17,6 +17,7 @@ */ package org.opendata.curation.d4.domain; +import org.opendata.core.io.prov.DataSink; import org.opendata.core.set.IdentifiableIDSet; /** @@ -25,7 +26,7 @@ * * @author Heiko Mueller */ -public interface StrongDomainConsumer { +public interface StrongDomainConsumer extends DataSink { /** * Signal the end of the output stream. diff --git a/src/main/java/org/opendata/curation/d4/domain/StrongDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/StrongDomainGenerator.java index c89e4c0..529cae9 100644 --- a/src/main/java/org/opendata/curation/d4/domain/StrongDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/StrongDomainGenerator.java @@ -17,6 +17,7 @@ */ package org.opendata.curation.d4.domain; +import java.io.File; import java.math.BigDecimal; import java.util.Date; import java.util.HashMap; @@ -227,39 +228,50 @@ public StrongDomainGenerator() { * Compute set local domains that have sufficient support. 
* * @param nodes - * @param localDomains + * @param domainReader * @param domainOverlapConstraint * @param minSupportConstraint * @param supportFraction * @param verbose * @param threads - * @param consumer + * @param outputFile * @throws java.lang.InterruptedException * @throws java.io.IOException */ public void run( EQIndex nodes, - IdentifiableObjectSet localDomains, + DomainReader domainReader, Threshold domainOverlapConstraint, Threshold minSupportConstraint, BigDecimal supportFraction, boolean verbose, int threads, - StrongDomainConsumer consumer + File outputFile ) throws java.lang.InterruptedException, java.io.IOException { + + IdentifiableObjectSet localDomains; + localDomains = domainReader.read(); Date start = new Date(); if (verbose) { System.out.println( String.format( - "STRING DOMAINS FOR %d LOCAL DOMAINS USING:\n" + + "STRONG DOMAINS FOR %d LOCAL DOMAINS USING:\n" + + " --eqs=%s\n" + + " --localdomains=%s\n" + " --domainOverlap=%s\n" + + " --minSupport=%s\n" + " --supportFraction=%s\n" + - " --threads=%d", + " --threads=%d\n" + + " --strongdomains=%s", localDomains.length(), + nodes.source(), + domainReader.source(), domainOverlapConstraint.toPlainString(), + minSupportConstraint.toPlainString(), supportFraction.toPlainString(), - threads + threads, + outputFile.getName() ) ); } @@ -343,6 +355,9 @@ public void run( } } + StrongDomainWriter consumer; + consumer = new StrongDomainWriter(outputFile, localDomains); + consumer.open(); for (IdentifiableIDSet strongDomain : supportGraph.getComponents()) { diff --git a/src/main/java/org/opendata/curation/d4/domain/StrongDomainWriter.java b/src/main/java/org/opendata/curation/d4/domain/StrongDomainWriter.java index 8e5f36b..afcbbb5 100644 --- a/src/main/java/org/opendata/curation/d4/domain/StrongDomainWriter.java +++ b/src/main/java/org/opendata/curation/d4/domain/StrongDomainWriter.java @@ -105,4 +105,10 @@ public void open() { throw new RuntimeException(ex); } } + + @Override + public String target() { + + 
return _file.getName(); + } } From f92e5f5b7267531dad8b7bd329871eb51421e331 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 9 Dec 2020 14:45:27 -0500 Subject: [PATCH 20/25] Add option for signature block sketches in column expansion and local domain discovery --- .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 42 +++++++++- .../d4/column/ParallelColumnExpander.java | 22 ++++- .../ExternalMemLocalDomainGenerator.java | 10 ++- .../d4/domain/InMemLocalDomainGenerator.java | 11 ++- .../d4/signature/SignatureBlocks.java | 23 ++++++ .../SignatureBlocksNoSketchFactory.java | 41 ++++++++++ .../sketch/SignatureBlocksSizeSketch.java | 81 +++++++++++++++++++ .../SignatureBlocksSizeSketchFactory.java | 48 +++++++++++ .../sketch/SignatureBlocksSketchFactory.java | 46 +++++++++++ 10 files changed, 319 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java create mode 100644 src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java create mode 100644 src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java create mode 100644 src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 8cf3c11..8689938 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev13"; + public static final String VERSION = "0.29.0.dev15"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index af6df1c..17fede4 100644 --- 
a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -45,12 +45,13 @@ import org.opendata.curation.d4.domain.InMemLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomain; import org.opendata.curation.d4.domain.StrongDomainReader; -import org.opendata.curation.d4.domain.StrongDomainWriter; import org.opendata.curation.d4.export.ExportStrongDomains; import org.opendata.curation.d4.export.PrimaryDomainWriter; import org.opendata.curation.d4.signature.SignatureBlocksReader; import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.SignatureBlocksWriter; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksSizeSketchFactory; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.db.Database; @@ -67,11 +68,16 @@ * @author Heiko Mueller */ public class D4 { + + private static SignatureBlocksSketchFactory SignatureBlocksNoSketchFactory() { + throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + } public void expandColumns( EQIndex nodeIndex, SignatureBlocksStream signatures, String trimmer, + SignatureBlocksSketchFactory sketchFactory, Threshold expandThreshold, int numberOfIterations, BigDecimal decreaseFactor, @@ -86,6 +92,7 @@ public void expandColumns( nodeIndex, signatures, trimmer, + sketchFactory, db, db.keys(), expandThreshold, @@ -122,11 +129,36 @@ public void exportStrongDomains( ); } + /** + * Factory for sketch factories. Create the instance of the sketch factory + * from the given specification. + * + * If the specification is null the non-sketch factory is returned that does + * not modify the signature blocks. 
+ * + * The only other sketch specification that is currently supported is + * 'Nn' which returns a size threshold sketch generator using n as the + * size threshold. + * + * @param spec + * @return + */ + private static SignatureBlocksSketchFactory getSketchFactory(String spec) { + + if (spec == null) { + return SignatureBlocksNoSketchFactory(); + } else if ((spec.toUpperCase().startsWith("N")) && (spec.length() > 1)) { + return new SignatureBlocksSizeSketchFactory(Integer.parseInt(spec.substring(1))); + } + throw new IllegalArgumentException(String.format("Invalid sketch specification '%s", spec)); + } + public void localDomains( EQIndex nodeIndex, File columnsFile, SignatureBlocksReader signatures, String trimmer, + SignatureBlocksSketchFactory sketchFactory, boolean originalOnly, int threads, boolean inMem, @@ -143,6 +175,7 @@ public void localDomains( columnIndex, signatures.read(), trimmer, + sketchFactory, originalOnly, threads, verbose, @@ -154,6 +187,7 @@ public void localDomains( columnIndex, signatures, trimmer, + sketchFactory, originalOnly, threads, verbose, @@ -429,6 +463,7 @@ public static void main(String[] args) { "trimmer", " [default: " + SignatureTrimmer.CENTRIST + "]" ), + new Parameter("sketch", " [default: null]"), new Parameter("expandThreshold", " [default: 'GT0.25']"), new Parameter("decrease", " [default: 0.05]"), new Parameter("iterations", " [default: 5]"), @@ -441,6 +476,7 @@ public static void main(String[] args) { File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz"); File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); String trimmer = params.getAsString("trimmer", SignatureTrimmer.CENTRIST); + String sketch = params.getAsString("sketch", null); Threshold expandThreshold = params.getAsConstraint("expandThreshold", "GT0.25"); BigDecimal decreaseFactor = params .getAsBigDecimal("decrease", new BigDecimal("0.05")); @@ -453,6 +489,7 @@ public static void main(String[] args) { new EQIndex(eqFile), new 
SignatureBlocksReader(signatureFile), trimmer, + getSketchFactory(sketch), expandThreshold, numberOfIterations, decreaseFactor, @@ -481,6 +518,7 @@ public static void main(String[] args) { "trimmer", " [default: " + SignatureTrimmer.CENTRIST + "]" ), + new Parameter("sketch", " [default: null]"), new Parameter("originalonly", " [default: false]"), new Parameter("threads", " [default: 6]"), new Parameter("inmem", " [default: false]"), @@ -496,6 +534,7 @@ public static void main(String[] args) { File columnsFile = params.getAsFile("columns", "expanded-columns.txt.gz"); File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); String trimmer = params.getAsString("trimmer", SignatureTrimmer.CENTRIST); + String sketch = params.getAsString("sketch", null); boolean originalOnly = params.getAsBool("originalonly", false); int threads = params.getAsInt("threads", 6); boolean inMem = params.getAsBool("inmem", false); @@ -507,6 +546,7 @@ public static void main(String[] args) { columnsFile, new SignatureBlocksReader(signatureFile), trimmer, + getSketchFactory(sketch), originalOnly, threads, inMem, diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index f6736a9..b8723e7 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -39,7 +39,9 @@ import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.util.MemUsagePrinter; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; @@ -66,6 +68,7 @@ private class ExpanderTask implements Runnable { private final 
EQIndex _nodes; private final int _numberOfIterations; private final SignatureBlocksStream _signatures; + private final SignatureBlocksSketchFactory _sketchFactory; private final Threshold _threshold; private final SignatureTrimmerFactory _trimmerFactory; private final boolean _verbose; @@ -76,6 +79,7 @@ public ExpanderTask( List columns, SignatureBlocksStream signatures, SignatureTrimmerFactory trimmerFactory, + SignatureBlocksSketchFactory sketchFactory, Threshold threshold, BigDecimal decreaseFactor, int numberOfIterations, @@ -87,6 +91,7 @@ public ExpanderTask( _columns = columns; _signatures = signatures; _trimmerFactory = trimmerFactory; + _sketchFactory = sketchFactory; _threshold = threshold; _decreaseFactor = decreaseFactor; _numberOfIterations = numberOfIterations; @@ -139,8 +144,10 @@ public void run() { ) ); } + SignatureBlocksConsumer consumer; + consumer = _sketchFactory.getConsumer(dispatcher); try { - _signatures.stream(dispatcher); + _signatures.stream(consumer); } catch (java.io.IOException ex) { throw new RuntimeException(ex); } @@ -191,6 +198,7 @@ public void run( EQIndex nodes, SignatureBlocksStream signatures, String trimmer, + SignatureBlocksSketchFactory sketchFactory, IdentifiableObjectSet db, IDSet columnFilter, Threshold threshold, @@ -237,6 +245,7 @@ public void run( " --eqs=%s\n" + " --signatures=%s\n" + " --trimmer=%s\n" + + " --sketch=%s\n" + " --expandThreshold=%s\n" + " --decrease=%s\n" + " --iterations=%d\n" + @@ -247,6 +256,7 @@ public void run( nodes.source(), signatures.source(), trimmer, + sketchFactory.toDocString(), threshold.toPlainString(), decreaseFactor.toPlainString(), numberOfIterations, @@ -261,6 +271,11 @@ public void run( SignatureTrimmerFactory trimmerFactory; trimmerFactory = new SignatureTrimmerFactory(nodes, nodes.columns(), trimmer); + ExpandedColumnWriter writer; + writer = new ExpandedColumnWriter(outputFile, groups); + + writer.open(); + ExecutorService es = Executors.newCachedThreadPool(); for (int iThread = 
0; iThread < threads; iThread++) { List taskColumns = new ArrayList<>(); @@ -273,11 +288,12 @@ public void run( taskColumns, signatures, trimmerFactory, + sketchFactory, threshold, decreaseFactor, numberOfIterations, verbose, - new ExpandedColumnWriter(outputFile, groups) + writer ); es.execute(expander); } @@ -288,6 +304,8 @@ public void run( throw new RuntimeException(ex); } + writer.close(); + Date end = new Date(); if (verbose) { diff --git a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java index f0ae266..4462a2e 100644 --- a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java @@ -34,6 +34,7 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.eq.EQIndex; /** @@ -59,6 +60,7 @@ private class DomainGeneratorTask implements Runnable { private final int _id; private final EQIndex _nodes; private final SignatureBlocksStream _signatures; + private final SignatureBlocksSketchFactory _sketchFactory; private final SignatureTrimmerFactory _trimmerFactory; private final boolean _verbose; @@ -68,6 +70,7 @@ public DomainGeneratorTask( List columns, SignatureBlocksStream signatures, SignatureTrimmerFactory trimmerFactory, + SignatureBlocksSketchFactory sketchFactory, UniqueDomainSet domains, boolean verbose ) { @@ -76,6 +79,7 @@ public DomainGeneratorTask( _columns = columns; _signatures = signatures; _trimmerFactory = trimmerFactory; + _sketchFactory = sketchFactory; _domains = domains; _verbose = verbose; } @@ -103,7 +107,7 @@ public void run() { Date start = new Date(); try { - 
_signatures.stream(dispatcher); + _signatures.stream(_sketchFactory.getConsumer(dispatcher)); } catch (java.io.IOException ex) { throw new RuntimeException(ex); } @@ -135,6 +139,7 @@ public void run( ExpandedColumnIndex columnIndex, SignatureBlocksStream signatures, String trimmer, + SignatureBlocksSketchFactory sketchFactory, boolean originalOnly, int threads, boolean verbose, @@ -160,6 +165,7 @@ public void run( " --columns=%s\n" + " --signatures=%s\n" + " --trimmer=%s\n" + + " --sketch=%s\n" + " --originalonly=%s\n" + " --threads=%d\n" + " --inmem=false\n" + @@ -169,6 +175,7 @@ public void run( columnIndex.source(), signatures.source(), trimmer, + sketchFactory.toDocString(), Boolean.toString(originalOnly), threads, consumer.target() @@ -196,6 +203,7 @@ public void run( columns, signatures, trimmerFactory, + sketchFactory, domains, verbose ); diff --git a/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java index 68e113d..e64f598 100644 --- a/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java @@ -25,7 +25,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; -import java.util.logging.Level; import org.opendata.curation.d4.telemetry.TelemetryCollector; import org.opendata.curation.d4.telemetry.TelemetryPrinter; import org.opendata.curation.d4.column.ExpandedColumn; @@ -36,6 +35,7 @@ import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.core.util.MemUsagePrinter; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.eq.EQIndex; /** @@ -59,6 +59,7 @@ private class DomainGeneratorTask implements Runnable { private final int _id; private final 
EQIndex _nodes; private final SignatureBlocksStream _signatures; + private final SignatureBlocksSketchFactory _sketchFactory; private final SignatureTrimmerFactory _trimmerFactory; private final boolean _verbose; @@ -68,6 +69,7 @@ public DomainGeneratorTask( ConcurrentLinkedQueue columns, SignatureBlocksStream signatures, SignatureTrimmerFactory trimmerFactory, + SignatureBlocksSketchFactory sketchFactory, UniqueDomainSet domains, boolean verbose ) { @@ -76,6 +78,7 @@ public DomainGeneratorTask( _columns = columns; _signatures = signatures; _trimmerFactory = trimmerFactory; + _sketchFactory = sketchFactory; _domains = domains; _verbose = verbose; } @@ -99,7 +102,7 @@ public void run() { trimmer = _trimmerFactory.getTrimmer(col.id(), domainGenerator); Date runStart = new Date(); try { - _signatures.stream(trimmer); + _signatures.stream(_sketchFactory.getConsumer(trimmer)); } catch (java.io.IOException ex) { throw new RuntimeException(ex); } @@ -134,6 +137,7 @@ public void run( ExpandedColumnIndex columnIndex, SignatureBlocksStream signatures, String trimmer, + SignatureBlocksSketchFactory sketchFactory, boolean originalOnly, int threads, boolean verbose, @@ -171,6 +175,7 @@ public int compare(ExpandedColumn col1, ExpandedColumn col2) { " --columns=%s\n" + " --signatures=%s\n" + " --trimmer=%s\n" + + " --sketch=%s\n" + " --originalonly=%s\n" + " --threads=%d\n" + " --inmem=true\n" + @@ -180,6 +185,7 @@ public int compare(ExpandedColumn col1, ExpandedColumn col2) { columnIndex.source(), signatures.source(), trimmer, + sketchFactory.toDocString(), Boolean.toString(originalOnly), threads, consumer.target() @@ -196,6 +202,7 @@ public int compare(ExpandedColumn col1, ExpandedColumn col2) { queue, signatures, trimmerFactory, + sketchFactory, domains, verbose ); diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java index 09dccd5..45c0a3a 100644 --- 
a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java @@ -39,18 +39,41 @@ public SignatureBlocks(int id, BigDecimal maxSim, int size) { _size = size; } + /** + * Get a block from the signature. + * + * @param index + * @return + */ public abstract int[] get(int index); + /** + * Test if the signature is empty (has no blocks). + * + * @return + */ public boolean isEmpty() { return (_size == 0); } + /** + * Similarity of the first entry in the signature. This is the similarity + * of the most similar term for the equivalence class that is represented + * by this signature. + * + * @return + */ public BigDecimal maxSim() { return _maxSim; } + /** + * Number of blocks in the signature. + * + * @return + */ public int size() { return _size; diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java new file mode 100644 index 0000000..47ebde1 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java @@ -0,0 +1,41 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.opendata.curation.d4.signature.sketch; + +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; + +/** + * Factory pattern for consumer that implement the no-sketch strategy for + * signature blocks sketches. + * + * @author Heiko Mueller + */ +public class SignatureBlocksNoSketchFactory implements SignatureBlocksSketchFactory { + + @Override + public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer) { + + return consumer; + } + + @Override + public String toDocString() { + + return "null"; + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java new file mode 100644 index 0000000..2b822a9 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java @@ -0,0 +1,81 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.sketch; + +import java.util.ArrayList; +import java.util.List; +import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.SignatureBlocksImpl; + +/** + * Consumer that prunes individual blocks in a signature. 
The size sketch + * consumer ensures that no block in a signature contains more nodes than a + * given threshold. + * + * In the default implementation only the first n elements in each block are + * kept. + * + * @author Heiko Mueller + */ +public class SignatureBlocksSizeSketch implements SignatureBlocksConsumer { + + private final SignatureBlocksConsumer _consumer; + public final int _n; + + public SignatureBlocksSizeSketch(int n, SignatureBlocksConsumer consumer) { + + _n = n; + _consumer = consumer; + + } + + @Override + public void close() { + + _consumer.close(); + } + + @Override + public void consume(SignatureBlocks sig) { + + List blocks = new ArrayList<>(); + for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + int[] block = sig.get(iBlock); + if (block.length > _n) { + int[] trimmedBlock = new int[_n]; + System.arraycopy(block, 0, trimmedBlock, 0, _n); + block = trimmedBlock; + } + blocks.add(block); + } + _consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), blocks)); + } + + @Override + public boolean isDone() { + + return _consumer.isDone(); + } + + @Override + public void open() { + + _consumer.open(); + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java new file mode 100644 index 0000000..c6d5c2e --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java @@ -0,0 +1,48 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.sketch; + +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; + +/** + * Factory for consumer that implement the size-threshold sketch strategy for + * signature blocks. + * + * @author Heiko Mueller + */ +public class SignatureBlocksSizeSketchFactory implements SignatureBlocksSketchFactory { + + private final int _n; + + public SignatureBlocksSizeSketchFactory(int n) { + + _n = n; + } + + @Override + public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer) { + + return new SignatureBlocksSizeSketch(_n, consumer); + } + + @Override + public String toDocString() { + + return String.format("N%d", _n); + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java new file mode 100644 index 0000000..b348c6a --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java @@ -0,0 +1,46 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.sketch; + +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; + +/** + * Factory pattern for generating signature blocks consumers for sketches. + * Each type of sketch generator class will implement their own factory. + * + * @author Heiko Mueller + */ +public interface SignatureBlocksSketchFactory { + + /** + * Get a signature blocks sketch consumer. The returned consumer will pass + * the modified signatures to the consumer that is given as the argument. + * + * @param consumer + * @return + */ + public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer); + + /** + * Get documentation string for the signature blocks sketches that are + * created by the consumer that are returned by this factory. 
+ * + * @return + */ + public String toDocString(); +} From 47b0cb2f96e4851b476f088bdfe5ec002de0c396 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 10 Dec 2020 12:53:18 -0500 Subject: [PATCH 21/25] Add signature drop stats experiment --- pom.xml | 6 +- .../SignatureDropStatsExperiment.java} | 133 +++++------------- 2 files changed, 40 insertions(+), 99 deletions(-) rename src/main/java/org/opendata/curation/d4/{signature/SignatureBlocksDropWriter.java => experiments/SignatureDropStatsExperiment.java} (58%) diff --git a/pom.xml b/pom.xml index b1f4f8e..6ee1046 100644 --- a/pom.xml +++ b/pom.xml @@ -70,13 +70,13 @@ true - org.opendata.curation.d4.D4 - + + - + org.opendata.curation.d4.experiments.SignatureDropStatsExperiment diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java b/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java similarity index 58% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java rename to src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java index cc9ca42..3195106 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDropWriter.java +++ b/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java @@ -15,13 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.opendata.curation.d4.signature; +package org.opendata.curation.d4.experiments; +import org.opendata.curation.d4.signature.*; import java.io.File; import java.io.PrintWriter; import java.math.BigDecimal; -import java.math.RoundingMode; -import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.concurrent.ConcurrentLinkedQueue; @@ -35,52 +34,16 @@ import org.opendata.core.io.SynchronizedWriter; import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.set.IDSet; +import org.opendata.db.eq.EQ; import org.opendata.db.eq.EQIndex; /** - * Generate output file containing information about the steepest drops in - * context signatures. - * - * The output contains a single tab-delimited line for each equivalence class - * containing the following information: - * - * - equivalence class identifier - * - similarity of first context signature entry - * - list of similarities for nodes where the steepest drop occurs (separated - * by ':'. + * Experiment to evaluate the impact of the last drop and full signature + * constraint on signature blocks generation. 
* * @author Heiko Mueller */ -public class SignatureBlocksDropWriter { - - private class SignatureDrop { - - private final int _blockLength; - private final int _columnCount; - private final BigDecimal _firstElement; - private final BigDecimal _lastElement; - - public SignatureDrop(BigDecimal first, BigDecimal last, int blockLength, int columnCount) { - - _firstElement = first; - _lastElement = last; - _blockLength = blockLength; - _columnCount = columnCount; - } - - @Override - public String toString() { - - return String.format( - "%s-%s:%d:%d", - _firstElement.setScale(2, RoundingMode.HALF_DOWN).toPlainString(), - _lastElement.setScale(2, RoundingMode.HALF_DOWN).toPlainString(), - _blockLength, - _columnCount - ); - } - } +public class SignatureDropStatsExperiment { private class BlockGeneratorTask implements Runnable { @@ -115,41 +78,36 @@ public void run() { if (sig.isEmpty()) { continue; } - int start = 0; - final int end = sig.size(); - ArrayList drops = new ArrayList<>(); - while (start < end) { - int pruneIndex = _candidateFinder.getPruneIndex(sig, start); - if (pruneIndex <= start) { - break; - } - int blockLen = pruneIndex - start; - IDSet columns = _eqIndex.get(nodeId).columns(); - for (int iEl = start; iEl < pruneIndex; iEl++) { - int memberId = sig.get(iEl).id(); - columns = columns.intersect(_eqIndex.get(memberId).columns()); - if (columns.isEmpty()) { - break; - } - } - drops.add( - new SignatureDrop( - sig.get(start).toBigDecimal(), - sig.get(pruneIndex - 1).toBigDecimal(), - blockLen, - columns.length() - ) - ); - start = pruneIndex; + int pruneIndex = _candidateFinder.getPruneIndex(sig); + double rightBound; + if (pruneIndex == sig.size()) { + rightBound = 0; + } else { + rightBound = sig.get(pruneIndex).value(); } - if (drops.isEmpty()) { - continue; + double drop = sig.get(pruneIndex - 1).value() - rightBound; + double lastValue = sig.get(sig.size() - 1).value(); + boolean lastDropIsGreater = drop < lastValue; + double fullSigDiff = 
sig.get(0).value() - lastValue; + boolean isFullSig = fullSigDiff < lastValue; + String type = null; + if (isFullSig) { + type = "F"; + } else if (lastDropIsGreater) { + type = "L"; } - String line = nodeId + "\t" + drops.get(0).toString(); - for (int iDrop = 1; iDrop < drops.size(); iDrop++) { - line += "|" + drops.get(iDrop).toString(); + if (type != null) { + EQ eq = _eqIndex.get(nodeId); + String line = String.format( + "%d\t%s\t%f\t%d\t%d", + nodeId, + type, + sig.get(0).value(), + eq.columns().length(), + sig.size() + ); + _writer.write(line); } - _writer.write(line); } } } @@ -160,25 +118,11 @@ public void run( ConcurrentLinkedQueue queue, CandidateSetFinder candidateFinder, int threads, - boolean verbose, SynchronizedWriter writer ) throws java.lang.InterruptedException, java.io.IOException { - if (verbose) { - System.out.println( - String.format( - "SIGNATURE BLOCKS FOR %d EQs USING:\n" + - " --threads=%d", - queue.size(), - threads - ) - ); - } - Date start = new Date(); - if (verbose) { - System.out.println("START @ " + start); - } + System.out.println("START @ " + start); ExecutorService es = Executors.newCachedThreadPool(); for (int iThread = 0; iThread < threads; iThread++) { @@ -200,9 +144,7 @@ public void run( } Date end = new Date(); - if (verbose) { - System.out.println("END @ " + end); - } + System.out.println("END @ " + end); } private final static String COMMAND = @@ -212,7 +154,7 @@ public void run( " "; private final static Logger LOGGER = Logger - .getLogger(SignatureBlocksDropWriter.class.getName()); + .getLogger(SignatureDropStatsExperiment.class.getName()); public static void main(String[] args) { @@ -237,13 +179,12 @@ public static void main(String[] args) { try (PrintWriter out = FileSystem.openPrintWriter(outputFile)) { EQIndex eqIndex = new EQIndex(eqFile); - new SignatureBlocksDropWriter().run( + new SignatureDropStatsExperiment().run( eqIndex, new ContextSignatureGenerator(eqIndex.nodes()), new 
ConcurrentLinkedQueue<>(eqIndex.keys().toList()), candidateFinder, threads, - true, new SynchronizedWriter(out) ); } catch (java.lang.InterruptedException | java.io.IOException ex) { From 45b9a244ebbdc1cda60b679a000461b6106242cb Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 10 Dec 2020 12:54:49 -0500 Subject: [PATCH 22/25] Add ignore minor drop option for signature generator --- .../core/prune/CandidateSetFinder.java | 137 ------------------ .../core/prune/MaxDropThresholdFinder.java | 137 ------------------ .../core/prune/NoPruneCandidateSet.java | 37 ----- .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 16 +- .../signature/SignatureBlocksGenerator.java | 123 ++++++++++++---- .../d4/signature/SignatureBlocksWriter.java | 9 +- .../data/test/prune/MaxDropFinderTest.java | 5 - .../prune/MaxDropThresholdFinderTest.java | 95 ------------ 9 files changed, 116 insertions(+), 445 deletions(-) delete mode 100644 src/main/java/org/opendata/core/prune/MaxDropThresholdFinder.java delete mode 100644 src/main/java/org/opendata/core/prune/NoPruneCandidateSet.java delete mode 100644 src/test/java/org/urban/data/test/prune/MaxDropThresholdFinderTest.java diff --git a/src/main/java/org/opendata/core/prune/CandidateSetFinder.java b/src/main/java/org/opendata/core/prune/CandidateSetFinder.java index dde13ee..cae5cef 100644 --- a/src/main/java/org/opendata/core/prune/CandidateSetFinder.java +++ b/src/main/java/org/opendata/core/prune/CandidateSetFinder.java @@ -17,13 +17,8 @@ */ package org.opendata.core.prune; -import java.util.ArrayList; import java.util.List; -import org.opendata.core.constraint.Threshold; import org.opendata.core.object.IdentifiableDouble; -import org.opendata.core.set.IDSet; -import org.opendata.core.set.ImmutableIDSet; -import org.opendata.core.util.StringHelper; /** * For a given list of identifiable double, find the pruning index for an @@ -33,80 +28,6 @@ * @param */ public abstract class 
CandidateSetFinder { - - // Drop finder names - public static final String MAX_DIFF = "MAX-DIFF"; - public static final String MAX_DIFF_THRESHOLD = "MAX-DIFF-THRESHOLD"; - public static final String THRESHOLD = "THRESHOLD"; - - // Drop finder specification syntax - public static final String MAXDIFFFINDER = - MAX_DIFF + - ":" + - ":[true | false]" + - ":[true | false]"; - public static final String MAXDIFFTHRESHOLDFINDER = - MAX_DIFF_THRESHOLD + - ":" + - ":[true | false]" + - ":[true | false]"; - public final static String THRESHOLDFINDER = - THRESHOLD + ":"; - - /** - * Print command line statement for drop finder arguments. - * - * @param indent - * @return - */ - public static String getCommand(String indent) { - - return indent + MAXDIFFFINDER + " |\n" + - indent + MAXDIFFTHRESHOLDFINDER + " |\n" + - indent + THRESHOLDFINDER; - } - - /** - * Get candidate set finder instance from specification string. - * - * @param spec - * @return - */ - public static CandidateSetFinder getFunction(String spec) { - - String[] tokens = spec.split(":"); - - try { - String name = tokens[0]; - if (name.equalsIgnoreCase(MAX_DIFF)) { - if (tokens.length == 4) { - return new MaxDropFinder( - Threshold.getConstraint(tokens[1]), - Boolean.parseBoolean(tokens[2]), - Boolean.parseBoolean(tokens[3]) - ); - } - } else if (name.equalsIgnoreCase(MAX_DIFF_THRESHOLD)) { - if (tokens.length == 4) { - return new MaxDropThresholdFinder( - Threshold.getConstraint(tokens[1]), - Boolean.parseBoolean(tokens[2]), - Boolean.parseBoolean(tokens[3]) - ); - } - } else if (name.equalsIgnoreCase(THRESHOLD)) { - if (tokens.length == 2) { - return new ThresholdFinder( - Threshold.getConstraint(tokens[1]) - ); - } - } else { - throw new java.lang.IllegalArgumentException("Unknown candidate set finder: " + name); - } - } catch (java.lang.NumberFormatException ex) { - } - throw new java.lang.IllegalArgumentException("Invalid candidate set finder specification: " + spec); - } /** * Return the pruning index. 
@@ -127,62 +48,4 @@ public int getPruneIndex(List elements) { * @return */ public abstract int getPruneIndex(List elements, int start); - - /** - * Return identifier of elements that occur before the pruning index. - * - * @param elements - * @return - */ - public IDSet pruneElements(List elements) { - - int pruneIndex = this.getPruneIndex(elements); - if (pruneIndex > 0) { - List result = new ArrayList<>(); - for (int iIndex = 0; iIndex < pruneIndex; iIndex++) { - result.add(elements.get(iIndex).id()); - } - return new ImmutableIDSet(result); - } else { - return new ImmutableIDSet(); - } - } - - /** - * Validate a given drop finder specification. - * - * Return the given specification if valid. Will raise - * IllegalArgumentException if specification is not valid. - * - * @param spec - * @return - */ - public static String validateSpecification(String spec) { - - String[] tokens = spec.split(":"); - - String message = "Invalid candidate set finder specification: " + spec; - - String name = tokens[0]; - if (name.equalsIgnoreCase(MAX_DIFF)) { - if (tokens.length == 4) { - Threshold.validateSpecification(tokens[1]); - } else { - throw new java.lang.IllegalArgumentException(message); - } - } else if (name.equalsIgnoreCase(MAX_DIFF_THRESHOLD)) { - if (tokens.length == 4) { - Threshold.validateSpecification(tokens[1]); - } else { - throw new java.lang.IllegalArgumentException(message); - } - } else if (name.equalsIgnoreCase(THRESHOLD)) { - Threshold - .validateSpecification(StringHelper.joinStrings(tokens, 1, ":")); - } else { - throw new java.lang.IllegalArgumentException("Unknown drop finder name: " + name); - } - - return spec; - } } diff --git a/src/main/java/org/opendata/core/prune/MaxDropThresholdFinder.java b/src/main/java/org/opendata/core/prune/MaxDropThresholdFinder.java deleted file mode 100644 index a16e253..0000000 --- a/src/main/java/org/opendata/core/prune/MaxDropThresholdFinder.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * This file is part of the 
Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.core.prune; - -import java.util.List; -import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.object.IdentifiableDouble; - -/** - * Find steepest drop in list of identifiable double values. - * - * Include all element that satisfy the threshold constraint or occur before the - * steepest drop. - * - * @author Heiko Mueller - * @param - */ -public class MaxDropThresholdFinder extends CandidateSetFinder { - - private final Threshold _nonEmptySignatureThreshold; - private final boolean _ignoreLastDrop; - private final boolean _fullSignatureConstraint; - - public MaxDropThresholdFinder( - Threshold nonEmptySignatureThreshold, - boolean fullSignatureConstraint, - boolean ignoreLastDrop - ) { - _nonEmptySignatureThreshold = nonEmptySignatureThreshold; - _fullSignatureConstraint = fullSignatureConstraint; - _ignoreLastDrop = ignoreLastDrop; - } - - public MaxDropThresholdFinder( - double nonEmptySignatureThreshold, - boolean fullSignatureConstraint, - boolean ignoreLastDrop - ) { - this( - new GreaterThanConstraint(nonEmptySignatureThreshold), - fullSignatureConstraint, - ignoreLastDrop - ); - } - - /** - * Get index position of steepest drop in a list of double values. 
- * - * Assumes that the list is sorted in decreasing order. Returns the index - * position of the element on the right side of the steepest drop. - * - * If the list is empty the result is 0. If the first element is smaller - * than the empty constraint threshold the result is 0. If the full - * signature constraint is satisfied the result is the size of the element - * vector. - * - * If the list contains a single element the result is 1 or 0 (in case the - * empty signature constraint is satisfied). - * - * @param elements - * @param start - * @return - */ - @Override - public int getPruneIndex(List elements, int start) { - - final int size = elements.size(); - - // Result is zero if element list is empty. - if (start >= size) { - return 0; - } - - // Result is zero if first element is smaller than the empty signature - // constraint threshold. - final double first = elements.get(0).value(); - if (!_nonEmptySignatureThreshold.isSatisfied(first)) { - return 0; - } - - // Return 1 if the size of the list is one - if ((size - start) == 1) { - return start + 1; - } - - // If the full signature constraint is satisfied the result equals the - // size of the array - final double last = elements.get(size - 1).value(); - if (_fullSignatureConstraint) { - if ((first - last) <= last) { - return size; - } - } - - // The initial value for maxDiff depends on whether we ignore the last - // drop or not. In the latter case, maxDiff is the value of the last - // drop. In the former case it is zero. 
- double maxDiff; - if (!_ignoreLastDrop) { - maxDiff = last; - } else { - maxDiff = 0f; - } - - int maxIndex = elements.size(); - int maxThresholdIndex = 0; - for (int iIndex = start; iIndex < size - 1; iIndex++) { - final double val = elements.get(iIndex).value(); - final double diff = val - elements.get(iIndex + 1).value(); - if (diff > maxDiff) { - maxIndex = iIndex + 1; - maxDiff = diff; - } - if (_nonEmptySignatureThreshold.isSatisfied(val)) { - maxThresholdIndex = iIndex + 1; - } - } - - return Math.max(maxIndex, maxThresholdIndex); - } -} diff --git a/src/main/java/org/opendata/core/prune/NoPruneCandidateSet.java b/src/main/java/org/opendata/core/prune/NoPruneCandidateSet.java deleted file mode 100644 index a7f4c47..0000000 --- a/src/main/java/org/opendata/core/prune/NoPruneCandidateSet.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.core.prune; - -import java.util.List; -import org.opendata.core.object.IdentifiableDouble; - -/** - * The "NO Pruning" candidate set finder returns all elements in a given list - * without any pruning. 
- * - * @author Heiko Mueller - * @param - */ -public class NoPruneCandidateSet extends CandidateSetFinder { - - @Override - public int getPruneIndex(List elements, int start) { - - return elements.size(); - } -} diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index 8689938..b84f98b 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev15"; + public static final String VERSION = "0.29.0.dev16"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 17fede4..8481270 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ b/src/main/java/org/opendata/curation/d4/D4.java @@ -53,7 +53,6 @@ import org.opendata.curation.d4.signature.sketch.SignatureBlocksSizeSketchFactory; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; -import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.db.Database; import org.opendata.db.column.Column; import org.opendata.db.eq.CompressedTermIndexGenerator; @@ -207,6 +206,7 @@ public void signatures( String trimmerSpec, boolean fullSignatureConstraint, boolean ignoreLastDrop, + boolean ignoreMinorDrop, int threads, boolean verbose, TelemetryCollector telemetry, @@ -217,12 +217,13 @@ public void signatures( new SignatureBlocksGenerator(telemetry).runWithMaxDrop( nodeIndex, new ConcurrentLinkedQueue<>(nodeIndex.keys().toList()), + trimmerSpec, fullSignatureConstraint, ignoreLastDrop, + ignoreMinorDrop, threads, verbose, - new SignatureTrimmerFactory(nodeIndex, nodeIndex.columns(), trimmerSpec) - 
.getTrimmer(sigWriter) + sigWriter ); if (verbose) { @@ -420,6 +421,9 @@ public static void main(String[] args) { " [default: 'compressed-term-index.txt.gz']" ), new Parameter("trimmer", " [default: LIBERAL]"), + new Parameter("fullSignatureConstraint", " [default: true]"), + new Parameter("ignoreLastDrop", " [default: true]"), + new Parameter("ignoreMinorDrop", " [default: true]"), new Parameter("threads", " [default: 6]"), new Parameter("verbose", " [default: true]"), new Parameter("signatures", " [default: 'signatures.txt.gz']") @@ -431,14 +435,16 @@ public static void main(String[] args) { int threads = params.getAsInt("threads", 6); boolean verbose = params.getAsBool("verbose", true); File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); - boolean fullSignatureConstraint = false; - boolean ignoreLastDrop = true; + boolean fullSignatureConstraint = params.getAsBool("fullSignatureConstraint", true); + boolean ignoreLastDrop = params.getAsBool("ignoreLastDrop", true); + boolean ignoreMinorDrop = params.getAsBool("ignoreMinorDrop", true); try { new D4().signatures( new EQIndex(eqFile), trimmerSpec, fullSignatureConstraint, ignoreLastDrop, + ignoreMinorDrop, threads, verbose, new TelemetryPrinter(), diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java index 4c65b78..36bec46 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java @@ -33,6 +33,8 @@ import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; import org.opendata.core.prune.ThresholdFinder; +import org.opendata.curation.d4.signature.trim.SignatureTrimmer; +import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.db.eq.EQIndex; /** @@ -57,10 +59,15 @@ public class 
SignatureBlocksGenerator { public static final String DEFAULT = MAX_DROP; + /** + * Worker for generating signature blocks for equivalence classes. + * + */ private class BlockGeneratorTask implements Runnable { private final CandidateSetFinder _candidateFinder; private final SignatureBlocksConsumer _consumer; + private final boolean _ignoreMinorDrop; private final ConcurrentLinkedQueue _queue; private final ContextSignatureGenerator _sigFact; @@ -68,15 +75,27 @@ public BlockGeneratorTask( ConcurrentLinkedQueue queue, ContextSignatureGenerator sigFact, CandidateSetFinder candidateFinder, + boolean ignoreMinorDrop, SignatureBlocksConsumer consumer ) { _queue = queue; _sigFact = sigFact; _candidateFinder = candidateFinder; + _ignoreMinorDrop = ignoreMinorDrop; _consumer = consumer; } + private int[] getBlock(List sig, int start, int end) { + + int[] block = new int[end - start]; + for (int iEl = start; iEl < end; iEl++) { + block[iEl - start] = sig.get(iEl).id(); + } + Arrays.sort(block); + return block; + } + @Override public void run() { @@ -88,29 +107,51 @@ public void run() { if (sig.isEmpty()) { continue; } + ArrayList blocks = new ArrayList<>(); int start = 0; final int end = sig.size(); - ArrayList blocks = new ArrayList<>(); while (start < end) { int pruneIndex = _candidateFinder.getPruneIndex(sig, start); if (pruneIndex <= start) { break; } - int[] block = new int[pruneIndex - start]; - for (int iEl = start; iEl < pruneIndex; iEl++) { - block[iEl - start] = sig.get(iEl).id(); + // If the ignoreMinorDrop flag is true check that the + // difference at the drop is at least as large as the + // difference between the elements in the block. 
+ if (_ignoreMinorDrop) { + double rightBound = 0; + if (sig.size() < pruneIndex) { + rightBound = sig.get(pruneIndex).value(); + } + double leftBound = sig.get(pruneIndex - 1).value(); + double diff = leftBound - rightBound; + double blockDiff = sig.get(start).value() - leftBound; + if (blockDiff > diff) { + // We encountered a minor drop. If the list of + // blocks is empty (i.e., there is no steepest drop + // but the full signature constrant is not satisfied + // either) we break to return an empty signature. + // Otherwise, we add the remaining elements as the + // final block. + if (blocks.isEmpty()) { + break; + } else { + pruneIndex = end; + } + } } - Arrays.sort(block); - blocks.add(block); + blocks.add(this.getBlock(sig, start, pruneIndex)); start = pruneIndex; } - _consumer.consume( - new SignatureBlocksImpl( - nodeId, - sig.get(0).toBigDecimal(), - blocks - ) - ); + if (!blocks.isEmpty()) { + _consumer.consume( + new SignatureBlocksImpl( + nodeId, + sig.get(0).toBigDecimal(), + blocks + ) + ); + } } } } @@ -131,22 +172,12 @@ private void compute( ContextSignatureGenerator sigFact, ConcurrentLinkedQueue queue, CandidateSetFinder candidateFinder, + boolean ignoreMinorDrop, int threads, boolean verbose, SignatureBlocksConsumer consumer ) throws java.lang.InterruptedException, java.io.IOException { - if (verbose) { - System.out.println( - String.format( - "SIGNATURE BLOCKS FOR %d EQs USING:\n" + - " --threads=%d", - queue.size(), - threads - ) - ); - } - Date start = new Date(); if (verbose) { System.out.println("START @ " + start); @@ -161,6 +192,7 @@ private void compute( queue, sigFact, candidateFinder, + ignoreMinorDrop, consumer ) ); @@ -212,6 +244,7 @@ public void runWithThreshold( new ContextSignatureGenerator(eqIndex.nodes()), queue, candidateFinder, + false, threads, verbose, consumer @@ -223,24 +256,52 @@ public void runWithThreshold( * * @param eqIndex * @param queue + * @param trimmerSpec * @param fullSignatureConstraint * @param ignoreLastDrop 
+ * @param ignoreMinorDrop * @param threads * @param verbose - * @param consumer + * @param writer * @throws java.lang.InterruptedException * @throws java.io.IOException */ public void runWithMaxDrop( EQIndex eqIndex, ConcurrentLinkedQueue queue, + String trimmerSpec, boolean fullSignatureConstraint, boolean ignoreLastDrop, + boolean ignoreMinorDrop, int threads, boolean verbose, - SignatureBlocksConsumer consumer + SignatureBlocksWriter writer ) throws java.lang.InterruptedException, java.io.IOException { + if (verbose) { + System.out.println( + String.format( + "SIGNATURE BLOCKS FOR %d EQs USING:\n" + + " --eqs=%s\n" + + " --trimmer=%s\n" + + " --fullSignatureConstraint=%s\n" + + " --ignoreLastDrop=%s\n" + + " --ignoreMinorDrop=%s\n" + + " --threads=%d\n" + + " --signatures=%s", + queue.size(), + eqIndex.source(), + trimmerSpec, + Boolean.toString(fullSignatureConstraint), + Boolean.toString(ignoreLastDrop), + Boolean.toString(ignoreMinorDrop), + threads, + writer.target() + ) + ); + } + + MaxDropFinder candidateFinder; candidateFinder = new MaxDropFinder<>( new GreaterThanConstraint(BigDecimal.ZERO), @@ -248,13 +309,18 @@ public void runWithMaxDrop( ignoreLastDrop ); + SignatureTrimmer trimmer; + trimmer = new SignatureTrimmerFactory(eqIndex, eqIndex.columns(), trimmerSpec) + .getTrimmer(writer); + this.compute( new ContextSignatureGenerator(eqIndex.nodes()), queue, candidateFinder, + ignoreMinorDrop, threads, verbose, - consumer + trimmer ); } @@ -263,6 +329,7 @@ public void runWithMaxDrop( * * @param eqIndex * @param queue + * @param ignoreMinorDrop * @param threads * @param verbose * @param consumer @@ -272,6 +339,7 @@ public void runWithMaxDrop( public void runWithMaxDrop( EQIndex eqIndex, ConcurrentLinkedQueue queue, + boolean ignoreMinorDrop, int threads, boolean verbose, SignatureBlocksConsumer consumer @@ -288,6 +356,7 @@ public void runWithMaxDrop( new ContextSignatureGenerator(eqIndex.nodes()), queue, candidateFinder, + ignoreMinorDrop, threads, 
verbose, consumer diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java index 87f1560..66e3b62 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.PrintWriter; import org.opendata.core.io.FileSystem; +import org.opendata.core.io.prov.DataSink; import org.opendata.core.util.FormatedBigDecimal; import org.opendata.core.util.StringHelper; @@ -27,7 +28,7 @@ * * @author Heiko Mueller */ -public class SignatureBlocksWriter implements SignatureBlocksConsumer { +public class SignatureBlocksWriter implements DataSink, SignatureBlocksConsumer { private final File _file; private int _openCount = 0; @@ -78,6 +79,12 @@ public synchronized void open() { } _openCount++; } + + @Override + public String target() { + + return _file.getName(); + } public void write(SignatureBlocksIndex signatures) { diff --git a/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java b/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java index 6434884..183d1a9 100644 --- a/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java +++ b/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java @@ -84,11 +84,6 @@ public void testDropFinder() { // The largest drop is between elements 2 and 3 assertEquals(3, dropFinder.getPruneIndex(elements)); - IDSet nodes = dropFinder.pruneElements(elements); - assertEquals(3, nodes.length()); - for (int nodeId : new int[]{10, 1, 2}) { - assertTrue(nodes.contains(nodeId)); - } elements.add(3, new IdentifiableDouble(10, 0.55)); diff --git a/src/test/java/org/urban/data/test/prune/MaxDropThresholdFinderTest.java b/src/test/java/org/urban/data/test/prune/MaxDropThresholdFinderTest.java deleted file mode 100644 index 4a562a9..0000000 --- 
a/src/test/java/org/urban/data/test/prune/MaxDropThresholdFinderTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.urban.data.test.prune; - -import java.util.ArrayList; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import static org.junit.Assert.*; -import org.opendata.core.object.IdentifiableDouble; -import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.prune.MaxDropThresholdFinder; - -/** - * - * @author Heiko Mueller - */ -public class MaxDropThresholdFinderTest { - - @BeforeClass - public static void setUpClass() { - } - - @AfterClass - public static void tearDownClass() { - } - - @Before - public void setUp() { - } - - @After - public void tearDown() { - } - - @Test - public void testDropFinder() { - - ArrayList elements = new ArrayList<>(); - elements.add(new IdentifiableDouble(10, 0.75)); - elements.add(new IdentifiableDouble(1, 0.7)); - elements.add(new IdentifiableDouble(2, 0.6)); - elements.add(new IdentifiableDouble(3, 0.57)); - elements.add(new IdentifiableDouble(7, 0.55)); - elements.add(new IdentifiableDouble(8, 0.49)); - elements.add(new IdentifiableDouble(4, 0.48)); - elements.add(new IdentifiableDouble(5, 0.45)); - elements.add(new IdentifiableDouble(6, 
0.4)); - elements.add(new IdentifiableDouble(11, 0.35)); - elements.add(new IdentifiableDouble(12, 0.28)); - - MaxDropFinder dropFinder; - dropFinder = new MaxDropFinder<>(0.5, true, true); - - assertEquals(2, new MaxDropFinder<>(0.5, true, true).getPruneIndex(elements)); - assertEquals(elements.size(), new MaxDropFinder<>(0.5, true, false).getPruneIndex(elements)); - assertEquals(5, new MaxDropThresholdFinder<>(0.5, true, true).getPruneIndex(elements)); - assertEquals(elements.size(), new MaxDropFinder<>(0.5, true, false).getPruneIndex(elements)); - - elements = new ArrayList<>(); - elements.add(new IdentifiableDouble(10, 0.75)); - elements.add(new IdentifiableDouble(1, 0.7)); - elements.add(new IdentifiableDouble(1, 0.77)); - elements.add(new IdentifiableDouble(1, 0.73)); - elements.add(new IdentifiableDouble(2, 0.6)); - elements.add(new IdentifiableDouble(3, 0.57)); - elements.add(new IdentifiableDouble(7, 0.55)); - elements.add(new IdentifiableDouble(8, 0.5)); - elements.add(new IdentifiableDouble(4, 0.48)); - elements.add(new IdentifiableDouble(5, 0.45)); - elements.add(new IdentifiableDouble(6, 0.3)); - elements.add(new IdentifiableDouble(11, 0.35)); - elements.add(new IdentifiableDouble(12, 0.28)); - - assertEquals(10, new MaxDropFinder<>(0.5, true, true).getPruneIndex(elements)); - assertEquals(10, new MaxDropThresholdFinder<>(0.5, true, true).getPruneIndex(elements)); -} -} From 187042bbbb375e38f2879fb91fd9b3e35a8c4373 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 10 Dec 2020 15:20:01 -0500 Subject: [PATCH 23/25] Refactor signature trimmer --- pom.xml | 4 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../java/org/opendata/curation/d4/D4.java | 26 +- .../curation/d4/column/ColumnExpander.java | 12 +- .../d4/column/ParallelColumnExpander.java | 17 +- .../d4/column/SingleColumnExpander.java | 8 +- .../ExternalMemLocalDomainGenerator.java | 18 +- .../d4/domain/InMemLocalDomainGenerator.java | 12 +- .../d4/domain/UndirectedDomainGenerator.java 
| 18 +- .../ConcurrentSignatureBlocksStream.java | 84 ------- ...natureBlocks.java => RobustSignature.java} | 23 +- ...Buffer.java => RobustSignatureBuffer.java} | 22 +- .../d4/signature/RobustSignatureConsumer.java | 32 +++ ...er.java => RobustSignatureDispatcher.java} | 29 +-- ...tor.java => RobustSignatureGenerator.java} | 237 ++++++------------ ...ocksImpl.java => RobustSignatureImpl.java} | 11 +- ...ksIndex.java => RobustSignatureIndex.java} | 29 +-- ...Stream.java => RobustSignatureStream.java} | 4 +- .../curation/d4/signature/SignatureBlock.java | 59 +++++ .../d4/signature/SignatureBlocksConsumer.java | 5 +- .../d4/signature/SignatureBlocksReader.java | 61 +++-- .../d4/signature/SignatureBlocksStats.java | 76 +++--- .../d4/signature/SignatureBlocksWriter.java | 26 +- .../SignatureBlocksNoSketchFactory.java | 6 +- .../sketch/SignatureBlocksSizeSketch.java | 21 +- .../SignatureBlocksSizeSketchFactory.java | 4 +- .../sketch/SignatureBlocksSketchFactory.java | 4 +- .../trim/CentristBlockRelevanceFilter.java | 128 ---------- .../d4/signature/trim/CentristSignature.java | 10 +- .../d4/signature/trim/CentristTrimmer.java | 45 +--- .../trim/ColumnSupportBlockFilter.java | 26 +- .../signature/trim/ConservativeTrimmer.java | 22 +- ...alTrimmer.java => LiberalRobustifier.java} | 36 +-- .../d4/signature/trim/NonTrimmer.java | 18 +- .../signature/trim/SignatureRobustifier.java | 83 ++++++ .../d4/signature/trim/SignatureTrimmer.java | 38 +-- .../trim/SignatureTrimmerFactory.java | 34 +-- 37 files changed, 508 insertions(+), 782 deletions(-) delete mode 100644 src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocks.java => RobustSignature.java} (68%) rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksBuffer.java => RobustSignatureBuffer.java} (73%) create mode 100644 src/main/java/org/opendata/curation/d4/signature/RobustSignatureConsumer.java rename 
src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksDispatcher.java => RobustSignatureDispatcher.java} (64%) rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksGenerator.java => RobustSignatureGenerator.java} (65%) rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksImpl.java => RobustSignatureImpl.java} (79%) rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksIndex.java => RobustSignatureIndex.java} (72%) rename src/main/java/org/opendata/curation/d4/signature/{SignatureBlocksStream.java => RobustSignatureStream.java} (87%) create mode 100644 src/main/java/org/opendata/curation/d4/signature/SignatureBlock.java delete mode 100644 src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java rename src/main/java/org/opendata/curation/d4/signature/trim/{LiberalTrimmer.java => LiberalRobustifier.java} (54%) create mode 100644 src/main/java/org/opendata/curation/d4/signature/trim/SignatureRobustifier.java diff --git a/pom.xml b/pom.xml index 6ee1046..4eaeea0 100644 --- a/pom.xml +++ b/pom.xml @@ -70,13 +70,13 @@ true - + org.opendata.curation.d4.D4 - org.opendata.curation.d4.experiments.SignatureDropStatsExperiment + diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index b84f98b..ff8f1a2 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev16"; + public static final String VERSION = "0.29.0.dev18"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/D4.java b/src/main/java/org/opendata/curation/d4/D4.java index 8481270..003cc93 100644 --- a/src/main/java/org/opendata/curation/d4/D4.java +++ 
b/src/main/java/org/opendata/curation/d4/D4.java @@ -35,7 +35,7 @@ import org.opendata.curation.d4.domain.DomainWriter; import org.opendata.curation.d4.domain.ExternalMemLocalDomainGenerator; import org.opendata.curation.d4.domain.StrongDomainGenerator; -import org.opendata.curation.d4.signature.SignatureBlocksGenerator; +import org.opendata.curation.d4.signature.RobustSignatureGenerator; import org.opendata.curation.d4.signature.SignatureBlocksStats; import org.opendata.core.constraint.Threshold; import org.opendata.core.io.FileListReader; @@ -48,7 +48,6 @@ import org.opendata.curation.d4.export.ExportStrongDomains; import org.opendata.curation.d4.export.PrimaryDomainWriter; import org.opendata.curation.d4.signature.SignatureBlocksReader; -import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.SignatureBlocksWriter; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSizeSketchFactory; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; @@ -60,6 +59,9 @@ import org.opendata.db.term.TermIndexGenerator; import org.opendata.db.term.TermIndexReader; import org.opendata.db.tools.Dataset2ColumnsConverter; +import org.opendata.curation.d4.signature.RobustSignatureStream; +import org.opendata.curation.d4.signature.sketch.SignatureBlocksNoSketchFactory; +import org.opendata.curation.d4.signature.trim.SignatureRobustifier; /** * Complete D4 pipeline. @@ -68,13 +70,9 @@ */ public class D4 { - private static SignatureBlocksSketchFactory SignatureBlocksNoSketchFactory() { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. 
- } - public void expandColumns( EQIndex nodeIndex, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, String trimmer, SignatureBlocksSketchFactory sketchFactory, Threshold expandThreshold, @@ -145,7 +143,7 @@ public void exportStrongDomains( private static SignatureBlocksSketchFactory getSketchFactory(String spec) { if (spec == null) { - return SignatureBlocksNoSketchFactory(); + return new SignatureBlocksNoSketchFactory(); } else if ((spec.toUpperCase().startsWith("N")) && (spec.length() > 1)) { return new SignatureBlocksSizeSketchFactory(Integer.parseInt(spec.substring(1))); } @@ -214,7 +212,7 @@ public void signatures( ) throws java.lang.InterruptedException, java.io.IOException { SignatureBlocksWriter sigWriter = new SignatureBlocksWriter(outputFile); - new SignatureBlocksGenerator(telemetry).runWithMaxDrop( + new RobustSignatureGenerator(telemetry).run( nodeIndex, new ConcurrentLinkedQueue<>(nodeIndex.keys().toList()), trimmerSpec, @@ -420,9 +418,9 @@ public static void main(String[] args) { "eqs", " [default: 'compressed-term-index.txt.gz']" ), - new Parameter("trimmer", " [default: LIBERAL]"), + new Parameter("robustifier", " [default: LIBERAL]"), new Parameter("fullSignatureConstraint", " [default: true]"), - new Parameter("ignoreLastDrop", " [default: true]"), + new Parameter("ignoreLastDrop", " [default: false]"), new Parameter("ignoreMinorDrop", " [default: true]"), new Parameter("threads", " [default: 6]"), new Parameter("verbose", " [default: true]"), @@ -431,17 +429,17 @@ public static void main(String[] args) { args ); File eqFile = params.getAsFile("eqs", "compressed-term-index.txt.gz"); - String trimmerSpec = params.getAsString("trimmer", SignatureTrimmer.LIBERAL); + String robustifierSpec = params.getAsString("robustifier", SignatureRobustifier.LIBERAL); int threads = params.getAsInt("threads", 6); boolean verbose = params.getAsBool("verbose", true); File signatureFile = params.getAsFile("signatures", "signatures.txt.gz"); 
boolean fullSignatureConstraint = params.getAsBool("fullSignatureConstraint", true); - boolean ignoreLastDrop = params.getAsBool("ignoreLastDrop", true); + boolean ignoreLastDrop = params.getAsBool("ignoreLastDrop", false); boolean ignoreMinorDrop = params.getAsBool("ignoreMinorDrop", true); try { new D4().signatures( new EQIndex(eqFile), - trimmerSpec, + robustifierSpec, fullSignatureConstraint, ignoreLastDrop, ignoreMinorDrop, diff --git a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java index aade395..34b218e 100644 --- a/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ColumnExpander.java @@ -21,12 +21,12 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; -import org.opendata.curation.d4.signature.SignatureBlocksStream; +import org.opendata.curation.d4.signature.RobustSignatureDispatcher; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; import org.opendata.db.eq.EQIndex; +import org.opendata.curation.d4.signature.RobustSignatureStream; /** * Write set of expanded column equivalence classes to file. 
@@ -41,7 +41,7 @@ public class ColumnExpander implements Runnable { private final int _id; private final EQIndex _nodes; private final int _numberOfIterations; - private final SignatureBlocksStream _signatures; + private final RobustSignatureStream _signatures; private final Threshold _threshold; private final SignatureTrimmerFactory _trimmerFactory; @@ -49,7 +49,7 @@ public ColumnExpander( int id, EQIndex nodes, List columns, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, SignatureTrimmerFactory trimmerFactory, Threshold threshold, BigDecimal decreaseFactor, @@ -73,7 +73,7 @@ public void run() { System.out.println("TASK " + _id + " EXPAND " + _columns.size() + " COLUMNS"); List expanders = new ArrayList<>(); - SignatureBlocksDispatcher dispatcher = new SignatureBlocksDispatcher(); + RobustSignatureDispatcher dispatcher = new RobustSignatureDispatcher(); for (ExpandedColumn column : _columns) { SingleColumnExpander columnExpander; @@ -112,7 +112,7 @@ public void run() { throw new RuntimeException(ex); } ArrayList active = new ArrayList<>(); - dispatcher = new SignatureBlocksDispatcher(); + dispatcher = new RobustSignatureDispatcher(); int expansionCount = 0; int expandedCount = 0; for (SingleColumnExpander expander : expanders) { diff --git a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java index b8723e7..c18c29a 100644 --- a/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/ParallelColumnExpander.java @@ -31,7 +31,6 @@ import java.util.logging.Logger; import org.opendata.curation.d4.telemetry.TelemetryCollector; import org.opendata.curation.d4.telemetry.TelemetryPrinter; -import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import 
org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.constraint.Threshold; @@ -39,11 +38,13 @@ import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableObjectSet; import org.opendata.core.util.MemUsagePrinter; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; +import org.opendata.curation.d4.signature.RobustSignatureDispatcher; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.column.Column; import org.opendata.db.eq.EQIndex; +import org.opendata.curation.d4.signature.RobustSignatureStream; /** * Expand columns using multiple threads. Each thread expands a single columns @@ -67,7 +68,7 @@ private class ExpanderTask implements Runnable { private final int _id; private final EQIndex _nodes; private final int _numberOfIterations; - private final SignatureBlocksStream _signatures; + private final RobustSignatureStream _signatures; private final SignatureBlocksSketchFactory _sketchFactory; private final Threshold _threshold; private final SignatureTrimmerFactory _trimmerFactory; @@ -77,7 +78,7 @@ public ExpanderTask( int id, EQIndex nodes, List columns, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, SignatureTrimmerFactory trimmerFactory, SignatureBlocksSketchFactory sketchFactory, Threshold threshold, @@ -121,8 +122,8 @@ public void run() { } int round = 0; while (!columns.isEmpty()) { - SignatureBlocksDispatcher dispatcher; - dispatcher = new SignatureBlocksDispatcher(); + RobustSignatureDispatcher dispatcher; + dispatcher = new RobustSignatureDispatcher(); for (SingleColumnExpander expander : columns) { SignatureTrimmer trimmer; trimmer = _trimmerFactory @@ -144,7 +145,7 @@ public void run() { ) ); } - SignatureBlocksConsumer consumer; + RobustSignatureConsumer consumer; consumer = 
_sketchFactory.getConsumer(dispatcher); try { _signatures.stream(consumer); @@ -196,7 +197,7 @@ public ParallelColumnExpander() { public void run( EQIndex nodes, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, String trimmer, SignatureBlocksSketchFactory sketchFactory, IdentifiableObjectSet db, diff --git a/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java b/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java index 143d3cc..1d59080 100644 --- a/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java +++ b/src/main/java/org/opendata/curation/d4/column/SingleColumnExpander.java @@ -23,11 +23,11 @@ import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.constraint.Threshold; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; import org.opendata.db.eq.EQIndex; /** @@ -35,7 +35,7 @@ * * @author Heiko Mueller */ -public class SingleColumnExpander implements SignatureBlocksConsumer { +public class SingleColumnExpander implements RobustSignatureConsumer { private static final Logger LOGGER = Logger .getLogger(SingleColumnExpander.class.getName()); @@ -215,7 +215,7 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { boolean isOriginalNode = _column.isColumnNode(sig.id()); int weight = _nodeSizes[sig.id()]; diff --git a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java index 4462a2e..369ca38 100644 --- 
a/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/ExternalMemLocalDomainGenerator.java @@ -28,14 +28,14 @@ import org.opendata.curation.d4.telemetry.TelemetryPrinter; import org.opendata.curation.d4.column.ExpandedColumn; import org.opendata.curation.d4.column.ExpandedColumnIndex; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.set.MutableIdentifiableIDSet; -import org.opendata.curation.d4.signature.SignatureBlocksDispatcher; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; +import org.opendata.curation.d4.signature.RobustSignatureDispatcher; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.eq.EQIndex; +import org.opendata.curation.d4.signature.RobustSignatureStream; /** * Generator for local domains using undirected graphs. 
Each connected component @@ -59,7 +59,7 @@ private class DomainGeneratorTask implements Runnable { private final UniqueDomainSet _domains; private final int _id; private final EQIndex _nodes; - private final SignatureBlocksStream _signatures; + private final RobustSignatureStream _signatures; private final SignatureBlocksSketchFactory _sketchFactory; private final SignatureTrimmerFactory _trimmerFactory; private final boolean _verbose; @@ -68,7 +68,7 @@ public DomainGeneratorTask( int id, EQIndex nodes, List columns, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, SignatureTrimmerFactory trimmerFactory, SignatureBlocksSketchFactory sketchFactory, UniqueDomainSet domains, @@ -87,13 +87,13 @@ public DomainGeneratorTask( @Override public void run() { - SignatureBlocksDispatcher dispatcher; - dispatcher = new SignatureBlocksDispatcher(); + RobustSignatureDispatcher dispatcher; + dispatcher = new RobustSignatureDispatcher(); for (ExpandedColumn column : _columns) { MutableIdentifiableIDSet col; col = new MutableIdentifiableIDSet(column.id(), column.nodes()); - SignatureBlocksConsumer domainGenerator; + RobustSignatureConsumer domainGenerator; domainGenerator = new UndirectedDomainGenerator( column, _domains, @@ -137,7 +137,7 @@ public ExternalMemLocalDomainGenerator() { public void run( EQIndex nodes, ExpandedColumnIndex columnIndex, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, String trimmer, SignatureBlocksSketchFactory sketchFactory, boolean originalOnly, diff --git a/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java index e64f598..80e2e6c 100644 --- a/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/InMemLocalDomainGenerator.java @@ -29,14 +29,14 @@ import org.opendata.curation.d4.telemetry.TelemetryPrinter; import 
org.opendata.curation.d4.column.ExpandedColumn; import org.opendata.curation.d4.column.ExpandedColumnIndex; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksStream; import org.opendata.curation.d4.signature.trim.SignatureTrimmer; import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; import org.opendata.core.set.MutableIdentifiableIDSet; import org.opendata.core.util.MemUsagePrinter; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; import org.opendata.curation.d4.signature.sketch.SignatureBlocksSketchFactory; import org.opendata.db.eq.EQIndex; +import org.opendata.curation.d4.signature.RobustSignatureStream; /** * Generator for local domains using undirected graphs. Each connected component @@ -58,7 +58,7 @@ private class DomainGeneratorTask implements Runnable { private final UniqueDomainSet _domains; private final int _id; private final EQIndex _nodes; - private final SignatureBlocksStream _signatures; + private final RobustSignatureStream _signatures; private final SignatureBlocksSketchFactory _sketchFactory; private final SignatureTrimmerFactory _trimmerFactory; private final boolean _verbose; @@ -67,7 +67,7 @@ public DomainGeneratorTask( int id, EQIndex nodes, ConcurrentLinkedQueue columns, - SignatureBlocksStream signatures, + RobustSignatureStream signatures, SignatureTrimmerFactory trimmerFactory, SignatureBlocksSketchFactory sketchFactory, UniqueDomainSet domains, @@ -92,7 +92,7 @@ public void run() { while ((column = _columns.poll()) != null) { MutableIdentifiableIDSet col; col = new MutableIdentifiableIDSet(column.id(), column.nodes()); - SignatureBlocksConsumer domainGenerator; + RobustSignatureConsumer domainGenerator; domainGenerator = new UndirectedDomainGenerator( column, _domains, @@ -135,7 +135,7 @@ public InMemLocalDomainGenerator() { public void run( EQIndex nodes, ExpandedColumnIndex columnIndex, - SignatureBlocksStream signatures, + 
RobustSignatureStream signatures, String trimmer, SignatureBlocksSketchFactory sketchFactory, boolean originalOnly, diff --git a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java index 088e65c..08bf3ff 100644 --- a/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java +++ b/src/main/java/org/opendata/curation/d4/domain/UndirectedDomainGenerator.java @@ -18,10 +18,10 @@ package org.opendata.curation.d4.domain; import org.opendata.curation.d4.column.ExpandedColumn; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.graph.UndirectedConnectedComponents; import org.opendata.core.set.IdentifiableIDSet; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Generator for local domains in an expanded column. 
Domains are generated as @@ -30,7 +30,7 @@ * * @author Heiko Mueller */ -public class UndirectedDomainGenerator extends UndirectedConnectedComponents implements SignatureBlocksConsumer { +public class UndirectedDomainGenerator extends UndirectedConnectedComponents implements RobustSignatureConsumer { private final ExpandedColumn _column; private boolean _isDone = false; @@ -66,8 +66,12 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { + if (_isDone) { + return; + } + final int sigId = sig.id(); if (_column.contains(sigId)) { @@ -83,12 +87,6 @@ public void consume(SignatureBlocks sig) { } } } - - @Override - public boolean isDone() { - - return _isDone; - } @Override public void open() { diff --git a/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java b/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java deleted file mode 100644 index 6d249e6..0000000 --- a/src/main/java/org/opendata/curation/d4/signature/ConcurrentSignatureBlocksStream.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.opendata.curation.d4.signature; - -import java.io.BufferedReader; -import java.io.File; -import java.math.BigDecimal; -import java.util.LinkedList; -import org.opendata.core.io.FileSetReader; -import org.opendata.core.io.FileSystem; - -/** - * Reader for a signature blocks file. Generates a stream of signature blocks - * for a given consumer. - * - * @author Heiko Mueller - */ -public class ConcurrentSignatureBlocksStream extends FileSetReader { - - private final LinkedList _files; - private BufferedReader _in = null; - - public ConcurrentSignatureBlocksStream(File file, boolean verbose) throws java.io.IOException { - - super(file, verbose); - - _files = new LinkedList<>(); - for (File inputFile : this) { - _files.add(inputFile); - } - - if (!_files.isEmpty()) { - _in = FileSystem.openReader(_files.pop()); - } - } - - public ConcurrentSignatureBlocksStream(File file) throws java.io.IOException { - - this(file, false); - } - - public synchronized SignatureBlocks next() throws java.io.IOException { - - while (_in != null) { - String line = _in.readLine(); - if (line != null) { - String[] tokens = line.split("\t"); - int[][] blocks = new int[tokens.length - 2][]; - for (int iToken = 2; iToken < tokens.length; iToken++) { - blocks[iToken - 2] = SignatureBlocksReader. 
- getBlockNodes(tokens[iToken]); - } - return new SignatureBlocksImpl( - Integer.parseInt(tokens[0]), - new BigDecimal(tokens[1]), - blocks - ); - } else { - _in.close(); - _in = null; - if (!_files.isEmpty()) { - _in = FileSystem.openReader(_files.pop()); - } - } - } - - return null; - } -} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignature.java similarity index 68% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignature.java index 45c0a3a..af4a8df 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocks.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignature.java @@ -17,25 +17,22 @@ */ package org.opendata.curation.d4.signature; -import java.math.BigDecimal; import org.opendata.core.object.IdentifiableObjectImpl; /** - * List of blocks for a context signature. Contains the node identifier and the - * similarity of the first entry in the context signature. + * List of blocks for a robust context signature. The robust signature contains + * only the node identifier for each block but no similarity statistics. * * @author Heiko Mueller */ -public abstract class SignatureBlocks extends IdentifiableObjectImpl { +public abstract class RobustSignature extends IdentifiableObjectImpl { - private final BigDecimal _maxSim; private final int _size; - public SignatureBlocks(int id, BigDecimal maxSim, int size) { + public RobustSignature(int id, int size) { super(id); - _maxSim = maxSim; _size = size; } @@ -57,18 +54,6 @@ public boolean isEmpty() { return (_size == 0); } - /** - * Similarity of the first entry in the signature. This is the similarity - * of the most similar term for the equivalence class that is represented - * by this signature.
- * - * @return - */ - public BigDecimal maxSim() { - - return _maxSim; - } - /** * Number of blocks in the signature. * diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureBuffer.java similarity index 73% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureBuffer.java index 7319ce4..d192e81 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksBuffer.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureBuffer.java @@ -26,12 +26,12 @@ * * @author Heiko Mueller */ -public class SignatureBlocksBuffer implements Iterable, SignatureBlocksConsumer, SignatureBlocksStream { +public class RobustSignatureBuffer implements Iterable, RobustSignatureConsumer, RobustSignatureStream { - private final List _signatures = new ArrayList<>(); + private final List _signatures = new ArrayList<>(); private final String _source; - public SignatureBlocksBuffer(String source) { + public RobustSignatureBuffer(String source) { _source = source; } @@ -42,24 +42,18 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { _signatures.add(sig); } - public SignatureBlocks get(int index) { + public RobustSignature get(int index) { return _signatures.get(index); } - - @Override - public boolean isDone() { - - return false; - } @Override - public Iterator iterator() { + public Iterator iterator() { return _signatures.iterator(); } @@ -76,11 +70,11 @@ public int size() { } @Override - public void stream(SignatureBlocksConsumer consumer) { + public void stream(RobustSignatureConsumer consumer) { consumer.open(); - for (SignatureBlocks sig : _signatures) { + for (RobustSignature sig : _signatures) { consumer.consume(sig); } diff --git 
a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureConsumer.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureConsumer.java new file mode 100644 index 0000000..f272124 --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureConsumer.java @@ -0,0 +1,32 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature; + +import java.util.List; + +/** + * Consumer for a stream of robust signature blocks. 
+ * + * @author Heiko Mueller + */ +public interface RobustSignatureConsumer { + + public void close(); + public void consume(RobustSignature sig); + public void open(); +} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureDispatcher.java similarity index 64% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureDispatcher.java index 5398e26..c7c314c 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksDispatcher.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureDispatcher.java @@ -26,21 +26,21 @@ * * @author Heiko Mueller */ -public class SignatureBlocksDispatcher implements SignatureBlocksConsumer { +public class RobustSignatureDispatcher implements RobustSignatureConsumer { - private final List _consumers; + private final List _consumers; - public SignatureBlocksDispatcher(List consumers) { + public RobustSignatureDispatcher(List consumers) { _consumers = consumers; } - public SignatureBlocksDispatcher() { + public RobustSignatureDispatcher() { this(new ArrayList<>()); } - public void add(SignatureBlocksConsumer consumer) { + public void add(RobustSignatureConsumer consumer) { _consumers.add(consumer); } @@ -48,34 +48,23 @@ public void add(SignatureBlocksConsumer consumer) { @Override public void close() { - for (SignatureBlocksConsumer consumer : _consumers) { + for (RobustSignatureConsumer consumer : _consumers) { consumer.close(); } } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { - for (SignatureBlocksConsumer consumer : _consumers) { + for (RobustSignatureConsumer consumer : _consumers) { consumer.consume(sig); } } - @Override - public boolean isDone() { - - for (SignatureBlocksConsumer consumer : _consumers) { - if (consumer.isDone()) { - 
return true; - } - } - return false; - } - @Override public void open() { - for (SignatureBlocksConsumer consumer : _consumers) { + for (RobustSignatureConsumer consumer : _consumers) { consumer.open(); } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java similarity index 65% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java index 36bec46..88c3aed 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java @@ -29,36 +29,31 @@ import org.opendata.curation.d4.telemetry.TelemetryCollector; import org.opendata.curation.d4.telemetry.TelemetryPrinter; import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.core.constraint.Threshold; import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.prune.ThresholdFinder; -import org.opendata.curation.d4.signature.trim.SignatureTrimmer; -import org.opendata.curation.d4.signature.trim.SignatureTrimmerFactory; +import org.opendata.curation.d4.signature.trim.ColumnSupportBlockFilter; +import org.opendata.curation.d4.signature.trim.LiberalRobustifier; +import org.opendata.curation.d4.signature.trim.SignatureRobustifier; import org.opendata.db.eq.EQIndex; /** - * Generate output file containing context signature blocks. + * Generate output file containing robust context signature blocks. * * The output contains a single tab-delimited line for each equivalence class * containing the following information: * * - equivalence class identifier - * - similarity of first context signature entry - * - list of signature blocks. Each block is a comma-separated list of node - * identifier. 
Blocks are separated by a tab. + * - list of signature blocks. Each block is prefixed by the similarity of the + * first and last element (delimited by '-') and a a comma-separated list of + * node identifier (delimited from the prefix by ':'). Blocks are separated + * by a tab. * * @author Heiko Mueller */ -public class SignatureBlocksGenerator { +public class RobustSignatureGenerator { - public static final String MAX_DROP = "MAX-DROP"; - public static final String THRESHOLD = "THRESHOLD"; - public static final String TELEMETRY_ID = "SIGNATURE BLOCKS"; - public static final String DEFAULT = MAX_DROP; - /** * Worker for generating signature blocks for equivalence classes. * @@ -86,14 +81,18 @@ public BlockGeneratorTask( _consumer = consumer; } - private int[] getBlock(List sig, int start, int end) { + private SignatureBlock getBlock(List sig, int start, int end) { int[] block = new int[end - start]; for (int iEl = start; iEl < end; iEl++) { block[iEl - start] = sig.get(iEl).id(); } Arrays.sort(block); - return block; + return new SignatureBlock( + block, + sig.get(start).value(), + sig.get(end - 1).value() + ); } @Override @@ -107,7 +106,7 @@ public void run() { if (sig.isEmpty()) { continue; } - ArrayList blocks = new ArrayList<>(); + ArrayList blocks = new ArrayList<>(); int start = 0; final int end = sig.size(); while (start < end) { @@ -144,13 +143,7 @@ public void run() { start = pruneIndex; } if (!blocks.isEmpty()) { - _consumer.consume( - new SignatureBlocksImpl( - nodeId, - sig.get(0).toBigDecimal(), - blocks - ) - ); + _consumer.consume(nodeId, blocks); } } } @@ -158,105 +151,22 @@ public void run() { private final TelemetryCollector _telemetry; - public SignatureBlocksGenerator(TelemetryCollector telemetry) { + public RobustSignatureGenerator(TelemetryCollector telemetry) { _telemetry = telemetry; } - public SignatureBlocksGenerator() { + public RobustSignatureGenerator() { this(new TelemetryPrinter()); } - private void compute( - 
ContextSignatureGenerator sigFact, - ConcurrentLinkedQueue queue, - CandidateSetFinder candidateFinder, - boolean ignoreMinorDrop, - int threads, - boolean verbose, - SignatureBlocksConsumer consumer - ) throws java.lang.InterruptedException, java.io.IOException { - - Date start = new Date(); - if (verbose) { - System.out.println("START @ " + start); - } - - consumer.open(); - - ExecutorService es = Executors.newCachedThreadPool(); - for (int iThread = 0; iThread < threads; iThread++) { - es.execute( - new BlockGeneratorTask( - queue, - sigFact, - candidateFinder, - ignoreMinorDrop, - consumer - ) - ); - } - es.shutdown(); - try { - es.awaitTermination(threads, TimeUnit.DAYS); - } catch (java.lang.InterruptedException ex) { - throw new RuntimeException(ex); - } - - consumer.close(); - - Date end = new Date(); - if (verbose) { - System.out.println("END @ " + end); - } - - if (verbose) { - long execTime = end.getTime() - start.getTime(); - _telemetry.add(TELEMETRY_ID, execTime); - } - } - /** - * Generate signature blocks using a fixed threshold constraint. - * - * @param eqIndex - * @param queue - * @param threshold - * @param threads - * @param verbose - * @param consumer - * @throws java.lang.InterruptedException - * @throws java.io.IOException - */ - public void runWithThreshold( - EQIndex eqIndex, - ConcurrentLinkedQueue queue, - Threshold threshold, - int threads, - boolean verbose, - SignatureBlocksConsumer consumer - ) throws java.lang.InterruptedException, java.io.IOException { - - ThresholdFinder candidateFinder; - candidateFinder = new ThresholdFinder<>(threshold); - - this.compute( - new ContextSignatureGenerator(eqIndex.nodes()), - queue, - candidateFinder, - false, - threads, - verbose, - consumer - ); - } - /** * Generate signature blocks using consecutive steepest drops. 
* * @param eqIndex * @param queue - * @param trimmerSpec + * @param robustifierSpec * @param fullSignatureConstraint * @param ignoreLastDrop * @param ignoreMinorDrop @@ -266,10 +176,10 @@ public void runWithThreshold( * @throws java.lang.InterruptedException * @throws java.io.IOException */ - public void runWithMaxDrop( + public void run( EQIndex eqIndex, ConcurrentLinkedQueue queue, - String trimmerSpec, + String robustifierSpec, boolean fullSignatureConstraint, boolean ignoreLastDrop, boolean ignoreMinorDrop, @@ -283,7 +193,7 @@ public void runWithMaxDrop( String.format( "SIGNATURE BLOCKS FOR %d EQs USING:\n" + " --eqs=%s\n" + - " --trimmer=%s\n" + + " --robustifier=%s\n" + " --fullSignatureConstraint=%s\n" + " --ignoreLastDrop=%s\n" + " --ignoreMinorDrop=%s\n" + @@ -291,7 +201,7 @@ public void runWithMaxDrop( " --signatures=%s", queue.size(), eqIndex.source(), - trimmerSpec, + robustifierSpec, Boolean.toString(fullSignatureConstraint), Boolean.toString(ignoreLastDrop), Boolean.toString(ignoreMinorDrop), @@ -309,57 +219,56 @@ public void runWithMaxDrop( ignoreLastDrop ); - SignatureTrimmer trimmer; - trimmer = new SignatureTrimmerFactory(eqIndex, eqIndex.columns(), trimmerSpec) - .getTrimmer(writer); + SignatureRobustifier consumer; + if (robustifierSpec.equalsIgnoreCase(SignatureRobustifier.COLSUPP)) { + consumer = new ColumnSupportBlockFilter(eqIndex, writer); + } else if (robustifierSpec.equalsIgnoreCase(SignatureRobustifier.LIBERAL)) { + consumer = new LiberalRobustifier(eqIndex.nodeSizes(), writer); + } else { + throw new IllegalArgumentException( + String.format("Unknown robustifier '%s'", robustifierSpec) + ); + } - this.compute( - new ContextSignatureGenerator(eqIndex.nodes()), - queue, - candidateFinder, - ignoreMinorDrop, - threads, - verbose, - trimmer - ); - } - - /** - * Generate signature blocks using consecutive steepest drops. 
- * - * @param eqIndex - * @param queue - * @param ignoreMinorDrop - * @param threads - * @param verbose - * @param consumer - * @throws java.lang.InterruptedException - * @throws java.io.IOException - */ - public void runWithMaxDrop( - EQIndex eqIndex, - ConcurrentLinkedQueue queue, - boolean ignoreMinorDrop, - int threads, - boolean verbose, - SignatureBlocksConsumer consumer - ) throws java.lang.InterruptedException, java.io.IOException { - - MaxDropFinder candidateFinder; - candidateFinder = new MaxDropFinder<>( - new GreaterThanConstraint(BigDecimal.ZERO), - false, - true - ); + ContextSignatureGenerator sigFact; + sigFact = new ContextSignatureGenerator(eqIndex.nodes()); - this.compute( - new ContextSignatureGenerator(eqIndex.nodes()), - queue, - candidateFinder, - ignoreMinorDrop, - threads, - verbose, - consumer - ); + Date start = new Date(); + if (verbose) { + System.out.println("START @ " + start); + } + + consumer.open(); + + ExecutorService es = Executors.newCachedThreadPool(); + for (int iThread = 0; iThread < threads; iThread++) { + es.execute( + new BlockGeneratorTask( + queue, + sigFact, + candidateFinder, + ignoreMinorDrop, + consumer + ) + ); + } + es.shutdown(); + try { + es.awaitTermination(threads, TimeUnit.DAYS); + } catch (java.lang.InterruptedException ex) { + throw new RuntimeException(ex); + } + + consumer.close(); + + Date end = new Date(); + if (verbose) { + System.out.println("END @ " + end); + } + + if (verbose) { + long execTime = end.getTime() - start.getTime(); + _telemetry.add(TELEMETRY_ID, execTime); + } } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksImpl.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureImpl.java similarity index 79% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksImpl.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureImpl.java index 7622ed8..7c5c566 100644 --- 
a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksImpl.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureImpl.java @@ -17,7 +17,6 @@ */ package org.opendata.curation.d4.signature; -import java.math.BigDecimal; import java.util.List; /** @@ -25,13 +24,13 @@ * * @author Heiko Mueller */ -public class SignatureBlocksImpl extends SignatureBlocks { +public class RobustSignatureImpl extends RobustSignature { private final int[][] _blocks; - public SignatureBlocksImpl(int id, BigDecimal maxSim, List blocks) { + public RobustSignatureImpl(int id, List blocks) { - super(id, maxSim, blocks.size()); + super(id, blocks.size()); _blocks = new int[blocks.size()][]; for (int iBlock = 0; iBlock < blocks.size(); iBlock++) { @@ -39,9 +38,9 @@ public SignatureBlocksImpl(int id, BigDecimal maxSim, List blocks) { } } - public SignatureBlocksImpl(int id, BigDecimal maxSim, int[][] blocks) { + public RobustSignatureImpl(int id, int[][] blocks) { - super(id, maxSim, blocks.length); + super(id, blocks.length); _blocks = blocks; } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java similarity index 72% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java index dd9dc58..84761a4 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksIndex.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureIndex.java @@ -27,12 +27,12 @@ * * @author Heiko Mueller */ -public class SignatureBlocksIndex implements Iterable, SignatureBlocksConsumer, SignatureBlocksStream { +public class RobustSignatureIndex implements Iterable, RobustSignatureConsumer, RobustSignatureStream { - private final HashObjectSet _signatures = new HashObjectSet<>(); + private final HashObjectSet _signatures = new 
HashObjectSet<>(); private final String _source; - public SignatureBlocksIndex(String source) { + public RobustSignatureIndex(String source) { _source = source; } @@ -48,24 +48,18 @@ public void close() { } @Override - public synchronized void consume(SignatureBlocks sig) { + public synchronized void consume(RobustSignature sig) { _signatures.add(sig); } - public SignatureBlocks get(int id) { + public RobustSignature get(int id) { return _signatures.get(id); } - - @Override - public boolean isDone() { - - return false; - } @Override - public Iterator iterator() { + public Iterator iterator() { return _signatures.iterator(); } @@ -81,21 +75,18 @@ public void open() { } @Override - public void stream(SignatureBlocksConsumer consumer) { + public void stream(RobustSignatureConsumer consumer) { consumer.open(); - for (SignatureBlocks sig : _signatures) { + for (RobustSignature sig : _signatures) { consumer.consume(sig); - if (consumer.isDone()) { - break; - } } consumer.close(); } - public void stream(SignatureBlocksConsumer consumer, IDSet filter) { + public void stream(RobustSignatureConsumer consumer, IDSet filter) { consumer.open(); @@ -114,7 +105,7 @@ public String source() { return _source; } - public List toList() { + public List toList() { return _signatures.toList(); } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureStream.java similarity index 87% rename from src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java rename to src/main/java/org/opendata/curation/d4/signature/RobustSignatureStream.java index f55c09a..42e0ec4 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStream.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureStream.java @@ -24,7 +24,7 @@ * * @author Heiko Mueller */ -public interface SignatureBlocksStream extends DataCollection { +public interface 
RobustSignatureStream extends DataCollection { - public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException; + public void stream(RobustSignatureConsumer consumer) throws java.io.IOException; } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlock.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlock.java new file mode 100644 index 0000000..2fa803a --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlock.java @@ -0,0 +1,59 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature; + +/** + * Block in the robust signature for an equivalence class. Each block maintains + * the similarity for the first and last entry in the blocks as well as the list + * of all node identifier for the equivalence classes in the block. 
+ * + * @author Heiko Mueller + */ +public class SignatureBlock { + + private final int[] _elements; + private final double _firstValue; + private final double _lastValue; + + public SignatureBlock(int[] elements, double firstValue, double lastValue) { + + _elements = elements; + _firstValue = firstValue; + _lastValue = lastValue; + } + + public int[] elements() { + + return _elements; + } + + public double firstValue() { + + return _firstValue; + } + + public double lastValue() { + + return _lastValue; + } + + public int length() { + + return _elements.length; + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java index 8cfbf85..9b629d8 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksConsumer.java @@ -17,6 +17,8 @@ */ package org.opendata.curation.d4.signature; +import java.util.List; + /** * Consumer for a stream of signature blocks. 
* @@ -25,7 +27,6 @@ public interface SignatureBlocksConsumer { public void close(); - public void consume(SignatureBlocks sig); - public boolean isDone(); + public void consume(int nodeId, List blocks); public void open(); } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java index e06ca09..c162462 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksReader.java @@ -19,7 +19,7 @@ import java.io.BufferedReader; import java.io.File; -import java.math.BigDecimal; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.opendata.core.io.FileSetReader; @@ -31,7 +31,7 @@ * * @author Heiko Mueller */ -public class SignatureBlocksReader extends FileSetReader implements SignatureBlocksStream { +public class SignatureBlocksReader extends FileSetReader implements RobustSignatureStream { private final File _file; @@ -54,31 +54,36 @@ public SignatureBlocksReader(List files, File directory) { _file = directory; } - public static int[] getBlockNodes(String text) { + private SignatureBlock getBlock(String text) { - String[] tokens = text.split(","); + String[] tokens = text.substring(0, text.indexOf(":")).split("-"); + return new SignatureBlock( + this.getBlockNodes(text), + Double.parseDouble(tokens[0]), + Double.parseDouble(tokens[1]) + ); + } + + private int[] getBlockNodes(String text) { + + String[] tokens = text.substring(text.indexOf(":") + 1).split(","); int[] nodes = new int[tokens.length]; for (int iToken = 0; iToken < tokens.length; iToken++) { - String val = tokens[iToken]; - int pos = val.indexOf(":"); - if (pos != -1) { - val = val.substring(0, pos); - } - nodes[iToken] = Integer.parseInt(val); + nodes[iToken] = Integer.parseInt(tokens[iToken]); } Arrays.sort(nodes); return nodes; } - public SignatureBlocksIndex read() 
throws java.io.IOException { + public RobustSignatureIndex read() throws java.io.IOException { - SignatureBlocksIndex buffer = new SignatureBlocksIndex(this.source()); + RobustSignatureIndex buffer = new RobustSignatureIndex(this.source()); this.stream(buffer); return buffer; } @Override - public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException { + public void stream(RobustSignatureConsumer consumer) throws java.io.IOException { consumer.open(); @@ -87,13 +92,12 @@ public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException String line; while ((line = in.readLine()) != null) { String[] tokens = line.split("\t"); - int[][] blocks = new int[tokens.length - 2][]; - for (int iToken = 2; iToken < tokens.length; iToken++) { - blocks[iToken - 2] = this.getBlockNodes(tokens[iToken]); + int[][] blocks = new int[tokens.length - 1][]; + for (int iToken = 1; iToken < tokens.length; iToken++) { + blocks[iToken - 1] = this.getBlockNodes(tokens[iToken]); } - SignatureBlocks sig = new SignatureBlocksImpl( + RobustSignature sig = new RobustSignatureImpl( Integer.parseInt(tokens[0]), - new BigDecimal(tokens[1]), blocks ); consumer.consume(sig); @@ -104,6 +108,27 @@ public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException consumer.close(); } + public void stream(SignatureBlocksConsumer consumer) throws java.io.IOException { + + consumer.open(); + + for (File file : this) { + try (BufferedReader in = FileSystem.openReader(file)) { + String line; + while ((line = in.readLine()) != null) { + String[] tokens = line.split("\t"); + ArrayList blocks = new ArrayList<>(); + for (int iToken = 1; iToken < tokens.length; iToken++) { + blocks.add(this.getBlock(tokens[iToken])); + } + consumer.consume(Integer.parseInt(tokens[0]), blocks); + } + } + } + + consumer.close(); + } + @Override public String source() { diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java 
b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java index 82de5d3..a426ced 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksStats.java @@ -19,6 +19,7 @@ import java.io.File; import java.io.PrintWriter; +import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.opendata.core.util.Avg; @@ -79,8 +80,10 @@ public long sum() { } private StatsCollector _blockStats; - private SimilarityHistogram _histogram = null; - private StatsCollector[] _nodeStats; + private SimilarityHistogram _firstDropHistogram = null; + private SimilarityHistogram _lastDropHistogram = null; + private StatsCollector _nodeStats; + private SimilarityHistogram _similarityHistogram = null; @Override public void close() { @@ -88,63 +91,60 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(int nodeId, List blocks) { - _blockStats.add(sig.size()); - _histogram.add(sig.maxSim()); + _blockStats.add(blocks.size()); + _firstDropHistogram.add(blocks.get(0).lastValue()); + _lastDropHistogram.add(blocks.get(blocks.size() - 1).lastValue()); + _similarityHistogram.add(blocks.get(0).firstValue()); - int[] nodeCount = new int[11]; - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - int bl = sig.get(iBlock).length; - for (int i = 0; i < 10; i++) { - nodeCount[i] += Math.min(bl, (i + 1) * 10); - } - nodeCount[10] += bl; - } - - for (int i = 0; i < nodeCount.length; i++) { - _nodeStats[i].add(nodeCount[i]); + int nodeCount = 0; + for (int iBlock = 0; iBlock < blocks.size(); iBlock++) { + nodeCount += blocks.get(iBlock).length(); } - } - - @Override - public boolean isDone() { - return false; + _nodeStats.add(nodeCount); } @Override public void open() { _blockStats = new StatsCollector(); - _histogram = new SimilarityHistogram(); - _nodeStats = new StatsCollector[11]; - for (int i = 0; i < 
_nodeStats.length; i++) { - _nodeStats[i] = new StatsCollector(); - } + _firstDropHistogram = new SimilarityHistogram(); + _lastDropHistogram = new SimilarityHistogram(); + _similarityHistogram = new SimilarityHistogram(); + _nodeStats = new StatsCollector(); } public void print(PrintWriter out) { + out.println("SIMILARITIES:"); + out.println("\tMAX. SIM\tFIRST DROP\tLAST DROP"); + for (String key : _similarityHistogram.keys()) { + out.println( + String.format( + "%s\t%d\t%d\t%d", + key, + _similarityHistogram.get(key), + _firstDropHistogram.get(key), + _lastDropHistogram.get(key) + ) + ); + } + out.println(); out.println("SIGNATURE COUNT: " + _blockStats.count()); out.println(); out.println("SIGNATURE BLOCKS"); out.println("MIN. SIZE : " + _blockStats.min()); out.println("MAX. SIZE : " + _blockStats.max()); out.println("AVG. SIZE : " + _blockStats.avg()); + out.println("SUM : " + _blockStats.sum()); out.println(); - for (int i = 0; i < _nodeStats.length - 1; i++) { - out.println("NODE COUNTS " + ((i + 1) * 10)); - out.println("MIN. SIZE : " + _nodeStats[i].min()); - out.println("MAX. SIZE : " + _nodeStats[i].max()); - out.println("AVG. SIZE : " + _nodeStats[i].avg()); - out.println("SUM : " + _nodeStats[i].sum()); - } - out.println("NODE COUNTS (TOTAL)"); - out.println("MIN. SIZE : " + _nodeStats[10].min()); - out.println("MAX. SIZE : " + _nodeStats[10].max()); - out.println("AVG. SIZE : " + _nodeStats[10].avg()); - out.println("SUM : " + _nodeStats[10].sum()); + out.println("NODE COUNTS"); + out.println("MIN. SIZE : " + _nodeStats.min()); + out.println("MAX. SIZE : " + _nodeStats.max()); + out.println("AVG. 
SIZE : " + _nodeStats.avg()); + out.println("SUM : " + _nodeStats.sum()); out.flush(); } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java index 66e3b62..8f8512a 100644 --- a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksWriter.java @@ -19,9 +19,9 @@ import java.io.File; import java.io.PrintWriter; +import java.util.List; import org.opendata.core.io.FileSystem; import org.opendata.core.io.prov.DataSink; -import org.opendata.core.util.FormatedBigDecimal; import org.opendata.core.util.StringHelper; /** @@ -50,23 +50,22 @@ public synchronized void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(int nodeId, List blocks) { - String line = sig.id() + "\t" + new FormatedBigDecimal(sig.maxSim()).toString(); - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - line += "\t" + StringHelper.joinIntegers(sig.get(iBlock)); + String line = Integer.toString(nodeId); + for (SignatureBlock block : blocks) { + line += String.format( + "\t%.6f-%.6f:%s", + block.firstValue(), + block.lastValue(), + StringHelper.joinIntegers(block.elements()) + ); } synchronized(this) { _out.println(line); } } - @Override - public boolean isDone() { - - return false; - } - @Override public synchronized void open() { @@ -85,9 +84,4 @@ public String target() { return _file.getName(); } - - public void write(SignatureBlocksIndex signatures) { - - signatures.stream(this); - } } diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java index 47ebde1..851a2c9 100644 --- a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java +++ 
b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksNoSketchFactory.java @@ -17,7 +17,7 @@ */ package org.opendata.curation.d4.signature.sketch; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Factory pattern for consumer that implement the no-sketch strategy for @@ -28,7 +28,7 @@ public class SignatureBlocksNoSketchFactory implements SignatureBlocksSketchFactory { @Override - public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer) { + public RobustSignatureConsumer getConsumer(RobustSignatureConsumer consumer) { return consumer; } @@ -37,5 +37,5 @@ public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer) { public String toDocString() { return "null"; - } + } } diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java index 2b822a9..eb132c5 100644 --- a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketch.java @@ -19,9 +19,10 @@ import java.util.ArrayList; import java.util.List; -import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.RobustSignature; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksImpl; +import org.opendata.curation.d4.signature.RobustSignatureImpl; /** * Consumer that prunes individual blocks in a signature. 
The size sketch @@ -33,12 +34,12 @@ * * @author Heiko Mueller */ -public class SignatureBlocksSizeSketch implements SignatureBlocksConsumer { +public class SignatureBlocksSizeSketch implements RobustSignatureConsumer { - private final SignatureBlocksConsumer _consumer; + private final RobustSignatureConsumer _consumer; public final int _n; - public SignatureBlocksSizeSketch(int n, SignatureBlocksConsumer consumer) { + public SignatureBlocksSizeSketch(int n, RobustSignatureConsumer consumer) { _n = n; _consumer = consumer; @@ -52,7 +53,7 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { List blocks = new ArrayList<>(); for (int iBlock = 0; iBlock < sig.size(); iBlock++) { @@ -64,13 +65,7 @@ public void consume(SignatureBlocks sig) { } blocks.add(block); } - _consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), blocks)); - } - - @Override - public boolean isDone() { - - return _consumer.isDone(); + _consumer.consume(new RobustSignatureImpl(sig.id(), blocks)); } @Override diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java index c6d5c2e..baa8317 100644 --- a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSizeSketchFactory.java @@ -17,7 +17,7 @@ */ package org.opendata.curation.d4.signature.sketch; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Factory for consumer that implement the size-threshold sketch strategy for @@ -35,7 +35,7 @@ public SignatureBlocksSizeSketchFactory(int n) { } @Override - public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer) { + public RobustSignatureConsumer 
getConsumer(RobustSignatureConsumer consumer) { return new SignatureBlocksSizeSketch(_n, consumer); } diff --git a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java index b348c6a..048085c 100644 --- a/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/sketch/SignatureBlocksSketchFactory.java @@ -17,7 +17,7 @@ */ package org.opendata.curation.d4.signature.sketch; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Factory pattern for generating signature blocks consumers for sketches. @@ -34,7 +34,7 @@ public interface SignatureBlocksSketchFactory { * @param consumer * @return */ - public SignatureBlocksConsumer getConsumer(SignatureBlocksConsumer consumer); + public RobustSignatureConsumer getConsumer(RobustSignatureConsumer consumer); /** * Get documentation string for the signature blocks sketches that are diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java deleted file mode 100644 index f9477cd..0000000 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristBlockRelevanceFilter.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.curation.d4.signature.trim; - -import java.math.BigDecimal; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.constraint.ZeroThreshold; -import org.opendata.core.object.IdentifiableDouble; -import org.opendata.core.object.filter.AnyObjectFilter; -import org.opendata.core.prune.CandidateSetFinder; -import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.set.IDSet; -import org.opendata.core.set.IdentifiableIDSet; -import org.opendata.core.set.IdentifiableObjectSet; -import org.opendata.core.sort.DoubleValueDescSort; -import org.opendata.db.eq.EQIndex; - -/** - * Liberal signature blocks trimmer. The liberal trimmer prunes all - * blocks starting from the block with the most elements. Only if the first - * block is the largest block it will not be pruned. 
- * - * @author Heiko Mueller - */ -public class CentristBlockRelevanceFilter extends SignatureTrimmer { - - private final CandidateSetFinder _dropFinder; - private final EQIndex _eqIndex; - private final BlockScoreFunction _scoreFunc; - - public CentristBlockRelevanceFilter( - EQIndex eqIndex, - BlockScoreFunction scoreFunc, - CandidateSetFinder dropFinder, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer - ) { - super(new AnyObjectFilter(), nonEmptyConstraint, consumer); - - _eqIndex = eqIndex; - _scoreFunc = scoreFunc; - _dropFinder = dropFinder; - } - - public CentristBlockRelevanceFilter( - EQIndex eqIndex, - BlockScoreFunction scoreFunc, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer - ) { - - this( - eqIndex, - scoreFunc, - new MaxDropFinder<>( - new GreaterThanConstraint(BigDecimal.ZERO), - false, - false - ), - nonEmptyConstraint, - consumer - ); - } - - public CentristBlockRelevanceFilter( - EQIndex eqIndex, - BlockScoreFunction scoreFunc, - SignatureBlocksConsumer consumer - ) { - - this(eqIndex, scoreFunc, new ZeroThreshold(), consumer); - } - - public CentristBlockRelevanceFilter( - EQIndex eqIndex, - IdentifiableObjectSet columns, - SignatureBlocksConsumer consumer - ) { - - this(eqIndex, new PrecisionScore(eqIndex, columns), new ZeroThreshold(), consumer); - } - - @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { - - IDSet columns = _eqIndex.get(sig.id()).columns(); - - List elements = new ArrayList<>(); - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { - int[] block = sig.get(iBlock); - Arrays.sort(block); - BigDecimal score = _scoreFunc.maxScore(block, columns); - elements.add(new IdentifiableDouble(iBlock, score)); - } - Collections.sort(elements, new DoubleValueDescSort()); - int dropIndex = _dropFinder.getPruneIndex(elements); - if (dropIndex > 0) { - if (elements.get(0).value() > 0) { - for (int i = 0; i < elements.size(); i++) { - IdentifiableDouble e = 
elements.get(i); - } - consumer.consume(new CentristSignature(sig, elements, dropIndex)); - } - } - } -} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristSignature.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristSignature.java index 7438f4d..1bb0aca 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristSignature.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristSignature.java @@ -18,7 +18,7 @@ package org.opendata.curation.d4.signature.trim; import java.util.List; -import org.opendata.curation.d4.signature.SignatureBlocks; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.object.IdentifiableDouble; /** @@ -28,17 +28,17 @@ * * @author Heiko Mueller */ -public class CentristSignature extends SignatureBlocks { +public class CentristSignature extends RobustSignature { private final List _elements; - private final SignatureBlocks _sig; + private final RobustSignature _sig; public CentristSignature( - SignatureBlocks sig, + RobustSignature sig, List elements, int dropIndex ) { - super(sig.id(), sig.maxSim(), dropIndex); + super(sig.id(), dropIndex); _sig = sig; _elements = elements; diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java index a90b076..b6dad56 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java @@ -21,11 +21,8 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.core.constraint.Threshold; -import 
org.opendata.core.constraint.ZeroThreshold; import org.opendata.core.object.IdentifiableDouble; import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; @@ -33,6 +30,7 @@ import org.opendata.core.set.IDSet; import org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.sort.DoubleValueDescSort; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Centrist signature blocks trimmer. The centrist trimmer uses a scoring @@ -52,10 +50,9 @@ public CentristTrimmer( IdentifiableIDSet column, BlockScoreFunction scoreFunc, CandidateSetFinder dropFinder, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer + RobustSignatureConsumer consumer ) { - super(column, nonEmptyConstraint, consumer); + super(column, consumer); _columnId = column.id(); _scoreFunc = scoreFunc; @@ -65,8 +62,7 @@ public CentristTrimmer( public CentristTrimmer( IdentifiableIDSet column, BlockScoreFunction scoreFunc, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer + RobustSignatureConsumer consumer ) { this( @@ -77,39 +73,12 @@ public CentristTrimmer( false, false ), - nonEmptyConstraint, consumer ); } - public CentristTrimmer( - IdentifiableIDSet column, - BlockScoreFunction scoreFunc, - SignatureBlocksConsumer consumer - ) { - - this( - column, - scoreFunc, - new ZeroThreshold(), - consumer - ); - } - - public CentristTrimmer( - IdentifiableIDSet column, - BlockScoreFunction scoreFunc - ) { - - this( - column, - scoreFunc, - null - ); - } - @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + public void trim(RobustSignature sig, RobustSignatureConsumer consumer) { List elements = new ArrayList<>(); for (int iBlock = 0; iBlock < sig.size(); iBlock++) { @@ -126,7 +95,7 @@ public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { } } - public IDSet trimmedBlocks(SignatureBlocks sig) { + public IDSet trimmedBlocks(RobustSignature sig) { List elements = new 
ArrayList<>(); for (int iBlock = 0; iBlock < sig.size(); iBlock++) { diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java b/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java index 350f866..a90a00f 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/ColumnSupportBlockFilter.java @@ -17,13 +17,12 @@ */ package org.opendata.curation.d4.signature.trim; -import org.opendata.curation.d4.signature.SignatureBlocks; +import java.util.List; import org.opendata.curation.d4.signature.SignatureBlocksConsumer; import org.opendata.core.constraint.Threshold; import org.opendata.core.constraint.ZeroThreshold; -import org.opendata.core.object.filter.AnyObjectFilter; import org.opendata.core.set.IDSet; -import org.opendata.curation.d4.signature.SignatureBlocksImpl; +import org.opendata.curation.d4.signature.SignatureBlock; import org.opendata.db.eq.EQIndex; /** @@ -32,7 +31,7 @@ * * @author Heiko Mueller */ -public class ColumnSupportBlockFilter extends SignatureTrimmer { +public class ColumnSupportBlockFilter extends SignatureRobustifier { private final EQIndex _eqIndex; private final int _minStart; @@ -43,7 +42,7 @@ public ColumnSupportBlockFilter( Threshold nonEmptyConstraint, SignatureBlocksConsumer consumer ) { - super(new AnyObjectFilter(), nonEmptyConstraint, consumer); + super(consumer); _eqIndex = eqIndex; _minStart = minStart; @@ -67,15 +66,15 @@ public ColumnSupportBlockFilter( } @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + public void consume(int nodeId, List blocks) { - IDSet nodeColumns = _eqIndex.get(sig.id()).columns(); + IDSet nodeColumns = _eqIndex.get(nodeId).columns(); int lastIndex = 0; - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + for (SignatureBlock block : blocks) { IDSet columns = nodeColumns; - for (int nodeId : 
sig.get(iBlock)) { - columns = columns.intersect(_eqIndex.get(nodeId).columns()); + for (int memberId : block.elements()) { + columns = columns.intersect(_eqIndex.get(memberId).columns()); if (columns.isEmpty()) { break; } @@ -85,11 +84,6 @@ public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { } lastIndex++; } - int sigSize = Math.max(_minStart, lastIndex); - int[][] blocks = new int[sigSize][]; - for (int iBlock = 0; iBlock < blocks.length; iBlock++) { - blocks[iBlock] = sig.get(iBlock); - } - consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), blocks)); + this.push(nodeId, blocks, Math.max(_minStart, lastIndex)); } } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/ConservativeTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/ConservativeTrimmer.java index 8014557..a1b66cc 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/ConservativeTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/ConservativeTrimmer.java @@ -17,12 +17,10 @@ */ package org.opendata.curation.d4.signature.trim; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksImpl; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.constraint.ZeroThreshold; +import org.opendata.curation.d4.signature.RobustSignature; +import org.opendata.curation.d4.signature.RobustSignatureImpl; import org.opendata.core.set.IDSet; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Conservative signature blocks trimmer. 
The conservative trimmer prunes all @@ -34,22 +32,16 @@ public class ConservativeTrimmer extends SignatureTrimmer { public ConservativeTrimmer( IDSet column, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer + RobustSignatureConsumer consumer ) { - super(column, nonEmptyConstraint, consumer); - } - - public ConservativeTrimmer(IDSet column, SignatureBlocksConsumer consumer) { - - this(column, new ZeroThreshold(), consumer); + super(column, consumer); } @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + public void trim(RobustSignature sig, RobustSignatureConsumer consumer) { int[][] block = new int[1][]; block[0] = sig.get(0); - consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), block)); + consumer.consume(new RobustSignatureImpl(sig.id(), block)); } } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/LiberalTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/LiberalRobustifier.java similarity index 54% rename from src/main/java/org/opendata/curation/d4/signature/trim/LiberalTrimmer.java rename to src/main/java/org/opendata/curation/d4/signature/trim/LiberalRobustifier.java index d1ea518..287c5fc 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/LiberalTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/LiberalRobustifier.java @@ -17,12 +17,9 @@ */ package org.opendata.curation.d4.signature.trim; -import org.opendata.curation.d4.signature.SignatureBlocks; +import java.util.List; import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.curation.d4.signature.SignatureBlocksImpl; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.constraint.ZeroThreshold; -import org.opendata.core.object.filter.AnyObjectFilter; +import org.opendata.curation.d4.signature.SignatureBlock; /** * Liberal signature blocks trimmer. 
The liberal trimmer prunes all @@ -31,35 +28,26 @@ * * @author Heiko Mueller */ -public class LiberalTrimmer extends SignatureTrimmer { +public class LiberalRobustifier extends SignatureRobustifier { private final int[] _nodeSizes; - public LiberalTrimmer( - int[] nodeSizes, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer - ) { - super(new AnyObjectFilter(), nonEmptyConstraint, consumer); + public LiberalRobustifier(int[] nodeSizes, SignatureBlocksConsumer consumer) { + super(consumer); _nodeSizes = nodeSizes; } - - public LiberalTrimmer(int[] nodeSizes, SignatureBlocksConsumer consumer) { - - this(nodeSizes, new ZeroThreshold(), consumer); - } @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + public void consume(int nodeId, List blocks) { int index = 0; int maxIndex = -1; int maxSize = -1; - for (int iBlock = 0; iBlock < sig.size(); iBlock++) { + for (SignatureBlock block : blocks) { int size = 0; - for (int nodeId : sig.get(iBlock)) { - size += _nodeSizes[nodeId]; + for (int memberId : block.elements()) { + size += _nodeSizes[memberId]; } if (size > maxSize) { maxSize = size; @@ -67,10 +55,6 @@ public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { } index++; } - int[][] blocks = new int[Math.max(1, maxIndex)][]; - for (int iBlock = 0; iBlock < blocks.length; iBlock++) { - blocks[iBlock] = sig.get(iBlock); - } - consumer.consume(new SignatureBlocksImpl(sig.id(), sig.maxSim(), blocks)); + this.push(nodeId, blocks, Math.max(1, maxIndex)); } } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/NonTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/NonTrimmer.java index 0eee3a9..22adb20 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/NonTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/NonTrimmer.java @@ -17,11 +17,9 @@ */ package org.opendata.curation.d4.signature.trim; -import 
org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.constraint.ZeroThreshold; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.set.IDSet; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Signature trimmer that simply returns the complete list of signature blocks. @@ -32,19 +30,13 @@ public class NonTrimmer extends SignatureTrimmer { public NonTrimmer( IDSet column, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer + RobustSignatureConsumer consumer ) { - super(column, nonEmptyConstraint, consumer); - } - - public NonTrimmer(IDSet column, SignatureBlocksConsumer consumer) { - - this(column, new ZeroThreshold(), consumer); + super(column, consumer); } @Override - public void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer) { + public void trim(RobustSignature sig, RobustSignatureConsumer consumer) { consumer.consume(sig); } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureRobustifier.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureRobustifier.java new file mode 100644 index 0000000..82d9a3b --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureRobustifier.java @@ -0,0 +1,83 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature.trim; + +import java.util.ArrayList; +import java.util.List; +import org.opendata.curation.d4.signature.SignatureBlock; +import org.opendata.curation.d4.signature.SignatureBlocksConsumer; + +/** + * Base class for generating robust signatures for individual equivalence + * classes. Robust signatures are context signatures that have been divided + * into signature blocks. A robust signature will only maintain those blocks + * that are not classified as noisy blocks. The definition of what constitutes + * a noisy block is implementation dependent. + * + * @author Heiko Mueller + */ +public abstract class SignatureRobustifier implements SignatureBlocksConsumer { + + /** + * Global variables for trimmer types + */ + public final static String COLSUPP = "COLSUPP"; + public final static String LIBERAL = "LIBERAL"; + + private final SignatureBlocksConsumer _consumer; + + /** + * Initialize the consumer for the robust signature blocks. + * + * @param consumer + */ + public SignatureRobustifier(SignatureBlocksConsumer consumer) { + + _consumer = consumer; + } + + @Override + public void close() { + + _consumer.close(); + } + + @Override + public void open() { + + _consumer.open(); + } + + /** + * Push robust signature to associated consumer. Passes only a prefix of the + * block list to the underlying consumer. 
+ * + * @param nodeId + * @param blocks + * @param end + */ + public void push(int nodeId, List blocks, int end) { + + ArrayList prunedBlocks = new ArrayList<>(); + for (int iBlock = 0; iBlock < end; iBlock++) { + prunedBlocks.add(blocks.get(iBlock)); + } + + _consumer.consume(nodeId, prunedBlocks); + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java index 4a2763e..536b46e 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmer.java @@ -17,10 +17,9 @@ */ package org.opendata.curation.d4.signature.trim; -import org.opendata.curation.d4.signature.SignatureBlocks; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; -import org.opendata.core.constraint.Threshold; +import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.object.filter.ObjectFilter; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; /** * Base class for signature trimmer. The trimmer is used to generate robust @@ -29,38 +28,31 @@ * * @author Heiko Mueller */ -public abstract class SignatureTrimmer implements SignatureBlocksConsumer { +public abstract class SignatureTrimmer implements RobustSignatureConsumer { /** * Global variables for trimmer types */ public final static String CENTRIST = "CENTRIST"; - public final static String COLSUPP = "COLSUPP"; public final static String CONSERVATIVE = "CONSERVATIVE"; public final static String LIBERAL = "LIBERAL"; - private final SignatureBlocksConsumer _consumer; + private final RobustSignatureConsumer _consumer; private final ObjectFilter _filter; - private final Threshold _nonEmptyConstraint; /** - * Initialize the consumer for trimmed signatures, the column filter, and - * the empty signature constraint. 
The filter is used to ensure that only - * the signature for column elements are being trimmed. The constraint - * determines which signatures are being empty (based on the value of the - * most similar node in the signature). + * Initialize the consumer for trimmed signatures and the column filter. + * The filter is used to ensure that only the signature for column elements + * are being trimmed. * * @param consumer * @param filter - * @param nonEmptyConstraint */ public SignatureTrimmer( ObjectFilter filter, - Threshold nonEmptyConstraint, - SignatureBlocksConsumer consumer + RobustSignatureConsumer consumer ) { _filter = filter; - _nonEmptyConstraint = nonEmptyConstraint; _consumer = consumer; } @@ -71,20 +63,12 @@ public void close() { } @Override - public void consume(SignatureBlocks sig) { + public void consume(RobustSignature sig) { if ((_filter.contains(sig.id())) && (!sig.isEmpty())) { - if (_nonEmptyConstraint.isSatisfied(sig.maxSim())) { - this.trim(sig, _consumer); - } + this.trim(sig, _consumer); } } - - @Override - public boolean isDone() { - - return _consumer.isDone(); - } @Override public void open() { @@ -98,5 +82,5 @@ public void open() { * @param sig * @param consumer */ - public abstract void trim(SignatureBlocks sig, SignatureBlocksConsumer consumer); + public abstract void trim(RobustSignature sig, RobustSignatureConsumer consumer); } diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java index 7cf6d4c..caa233b 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/SignatureTrimmerFactory.java @@ -17,10 +17,9 @@ */ package org.opendata.curation.d4.signature.trim; -import org.opendata.core.constraint.Threshold; -import org.opendata.curation.d4.signature.SignatureBlocksConsumer; import 
org.opendata.core.set.IdentifiableIDSet; import org.opendata.core.set.IdentifiableObjectSet; +import org.opendata.curation.d4.signature.RobustSignatureConsumer; import org.opendata.db.eq.EQIndex; /** @@ -45,22 +44,6 @@ public SignatureTrimmerFactory( _trimmerSpec = trimmerSpec; } - /** - * Create an instance for a signature trimmer that is column independent. - * - * @param consumer - * @return - */ - public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { - - if (_trimmerSpec.equals(SignatureTrimmer.COLSUPP)) { - return new ColumnSupportBlockFilter(_nodes,consumer); - } else if (_trimmerSpec.equals(SignatureTrimmer.LIBERAL)) { - return new LiberalTrimmer(_nodes.nodeSizes(), consumer); - } - throw new IllegalArgumentException(String.format("Invalid trimmer: %s", _trimmerSpec)); - } - /** * Get column specific trimmer for a given column. We currently do not make * use of the empty signature constraint. @@ -69,7 +52,7 @@ public SignatureTrimmer getTrimmer(SignatureBlocksConsumer consumer) { * @param consumer * @return */ - public SignatureTrimmer getTrimmer(int columnId, SignatureBlocksConsumer consumer) { + public SignatureTrimmer getTrimmer(int columnId, RobustSignatureConsumer consumer) { IdentifiableIDSet column = _columns.get(columnId); if (_trimmerSpec.equals(SignatureTrimmer.CONSERVATIVE)) { @@ -79,19 +62,6 @@ public SignatureTrimmer getTrimmer(int columnId, SignatureBlocksConsumer consume _scoreFunc = new PrecisionScore(_nodes, _columns); } return new CentristTrimmer(column, _scoreFunc, consumer); - } else if (_trimmerSpec.startsWith(SignatureTrimmer.CENTRIST)) { - int pos = _trimmerSpec.indexOf(":"); - if (pos != -1) { - if (_scoreFunc == null) { - _scoreFunc = new PrecisionScore(_nodes, _columns); - } - return new CentristTrimmer( - column, - _scoreFunc, - Threshold.getConstraint(_trimmerSpec.substring(pos + 1)), - consumer - ); - } } else if (_trimmerSpec.equals(SignatureTrimmer.LIBERAL)) { return new NonTrimmer(column, consumer); } From 
122746ddc3ca257b6df6bad9e3cb5a59919b6b9f Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Fri, 11 Dec 2020 11:30:49 -0500 Subject: [PATCH 24/25] Refactor steepest drop finder to return drop object --- pom.xml | 2 +- .../core/prune/CandidateSetFinder.java | 51 ----- .../prune/{ThresholdFinder.java => Drop.java} | 54 +++--- .../opendata/core/prune/MaxDropFinder.java | 60 ++++-- .../org/opendata/core/prune/OtsuMethod.java | 3 +- .../org/opendata/curation/d4/Constants.java | 2 +- .../SignatureDropStatsExperiment.java | 7 +- .../signature/RobustSignatureGenerator.java | 129 +++++++------ .../signature/SignatureBlocksGenerator.java | 118 ++++++++++++ .../d4/signature/trim/CentristTrimmer.java | 5 +- .../data/test/prune/MaxDropFinderTest.java | 4 +- .../prune/SignatureBlocksGeneratorTest.java | 174 ++++++++++++++++++ 12 files changed, 442 insertions(+), 167 deletions(-) delete mode 100644 src/main/java/org/opendata/core/prune/CandidateSetFinder.java rename src/main/java/org/opendata/core/prune/{ThresholdFinder.java => Drop.java} (51%) create mode 100644 src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java create mode 100644 src/test/java/org/urban/data/test/prune/SignatureBlocksGeneratorTest.java diff --git a/pom.xml b/pom.xml index 4eaeea0..ff6f892 100644 --- a/pom.xml +++ b/pom.xml @@ -71,7 +71,7 @@ true org.opendata.curation.d4.D4 - + diff --git a/src/main/java/org/opendata/core/prune/CandidateSetFinder.java b/src/main/java/org/opendata/core/prune/CandidateSetFinder.java deleted file mode 100644 index cae5cef..0000000 --- a/src/main/java/org/opendata/core/prune/CandidateSetFinder.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * This file is part of the Data-Driven Domain Discovery Tool (D4). - * - * Copyright (c) 2018-2020 New York University. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.opendata.core.prune; - -import java.util.List; -import org.opendata.core.object.IdentifiableDouble; - -/** - * For a given list of identifiable double, find the pruning index for an - * implementation-specific pruning condition. - * - * @author Heiko Mueller - * @param - */ -public abstract class CandidateSetFinder { - - /** - * Return the pruning index. - * - * @param elements - * @return - */ - public int getPruneIndex(List elements) { - - return this.getPruneIndex(elements, 0); - } - - /** - * Return pruning index after the given start position. - * - * @param elements - * @param start - * @return - */ - public abstract int getPruneIndex(List elements, int start); -} diff --git a/src/main/java/org/opendata/core/prune/ThresholdFinder.java b/src/main/java/org/opendata/core/prune/Drop.java similarity index 51% rename from src/main/java/org/opendata/core/prune/ThresholdFinder.java rename to src/main/java/org/opendata/core/prune/Drop.java index 2f2bb1a..8153d99 100644 --- a/src/main/java/org/opendata/core/prune/ThresholdFinder.java +++ b/src/main/java/org/opendata/core/prune/Drop.java @@ -17,33 +17,43 @@ */ package org.opendata.core.prune; -import java.util.List; -import org.opendata.core.constraint.Threshold; -import org.opendata.core.object.IdentifiableDouble; - /** - * Return list of candidates that satisfy a given threshold constraint. + * Steepest drop information. Contains the right boundary of the drop and + * the difference. Also contains a flag indicating whether the drop is due + * to the full signature constraint. 
* * @author Heiko Mueller - * @param */ -public class ThresholdFinder extends CandidateSetFinder { - - private final Threshold _constraint; - - public ThresholdFinder(Threshold constraint) { +public class Drop { + + private final double _diff; + private final int _index; + private final boolean _isFullSignature; + + public Drop(int index, double diff, boolean isFullSignature) { - _constraint = constraint; + _index = index; + _diff = diff; + _isFullSignature = isFullSignature; } - - @Override - public int getPruneIndex(List elements, int start) { - - for (int iIndex = start; iIndex < elements.size(); iIndex++) { - if (!_constraint.isSatisfied(elements.get(iIndex).value())) { - return iIndex; - } - } - return elements.size(); + + public Drop() { + + this(0, 0., false); + } + + public double diff() { + + return _diff; + } + + public int index() { + + return _index; + } + + public boolean isFullSignature() { + + return _isFullSignature; } } diff --git a/src/main/java/org/opendata/core/prune/MaxDropFinder.java b/src/main/java/org/opendata/core/prune/MaxDropFinder.java index 1199eab..898feb3 100644 --- a/src/main/java/org/opendata/core/prune/MaxDropFinder.java +++ b/src/main/java/org/opendata/core/prune/MaxDropFinder.java @@ -20,6 +20,7 @@ import java.util.List; import org.opendata.core.constraint.GreaterThanConstraint; import org.opendata.core.constraint.Threshold; +import org.opendata.core.constraint.ZeroThreshold; import org.opendata.core.object.IdentifiableDouble; /** @@ -28,7 +29,7 @@ * @author Heiko Mueller * @param */ -public class MaxDropFinder extends CandidateSetFinder { +public class MaxDropFinder { private final Threshold _nonEmptySignatureThreshold; private final boolean _ignoreLastDrop; @@ -56,16 +57,24 @@ public MaxDropFinder( ); } + public MaxDropFinder(boolean fullSignatureConstraint, boolean ignoreLastDrop) { + + this(new ZeroThreshold(), fullSignatureConstraint, ignoreLastDrop); + } + /** - * Get index position of steepest drop in a list of double 
values. + * Get index position and difference for steepest drop in a list of double + * values. * - * Assumes that the list is sorted in decreasing order. Returns the index - * position of the element on the right side of the steepest drop. + * Assumes that the list is sorted in decreasing order. Returns an object + * that contains the index position of the element on the right side of the + * steepest drop. * - * If the list is empty the result is 0. If the first element is smaller + * If the list is empty the drop index is 0. If the first element is smaller * than the empty constraint threshold the result is 0. If the full * signature constraint is satisfied the result is the size of the element - * vector. + * vector. This will be indicated by the isFullSignature flag in the + * returned object. * * If the list contains a single element the result is 1 or 0 (in case the * empty signature constraint is satisfied). @@ -74,34 +83,34 @@ public MaxDropFinder( * @param start * @return */ - @Override - public int getPruneIndex(List elements, int start) { + public Drop getSteepestDrop(List elements, int start) { final int size = elements.size(); // Result is zero if element list is empty. if (start >= size) { - return 0; + return new Drop(); } // Result is zero if first element is smaller than the empty signature // constraint threshold. 
final double first = elements.get(0).value(); if (!_nonEmptySignatureThreshold.isSatisfied(first)) { - return 0; + return new Drop(); } // Return 1 if the size of the list is one if ((size - start) == 1) { - return start + 1; + return new Drop(start + 1, 0., true); } // If the full signature constraint is satisfied the result equals the // size of the array final double last = elements.get(size - 1).value(); if (_fullSignatureConstraint) { - if ((elements.get(start).value() - last) < last) { - return size; + double diff = elements.get(start).value() - last; + if (diff < last) { + return new Drop(size, diff, true); } } @@ -120,6 +129,29 @@ public int getPruneIndex(List elements, int start) { if ((!_ignoreLastDrop) && (last > maxDiff)) { maxIndex = size; } - return maxIndex; + return new Drop(maxIndex, maxDiff, false); + } + + /** + * Return the pruning index. + * + * @param elements + * @return + */ + public int getPruneIndex(List elements) { + + return this.getPruneIndex(elements, 0); + } + + /** + * Return pruning index after the given start position. 
+ * + * @param elements + * @param start + * @return + */ + public int getPruneIndex(List elements, int start) { + + return this.getSteepestDrop(elements, start).index(); } } diff --git a/src/main/java/org/opendata/core/prune/OtsuMethod.java b/src/main/java/org/opendata/core/prune/OtsuMethod.java index 1dbdd50..2330055 100644 --- a/src/main/java/org/opendata/core/prune/OtsuMethod.java +++ b/src/main/java/org/opendata/core/prune/OtsuMethod.java @@ -31,7 +31,7 @@ * @author Heiko Mueller * @param */ -public class OtsuMethod extends CandidateSetFinder { +public class OtsuMethod { private final int _scale; private final SizeFunction _sizeFunc; @@ -47,7 +47,6 @@ public OtsuMethod(SizeFunction sizeFunc) { this(sizeFunc, 3); } - @Override public int getPruneIndex(List elements, int start) { SimilarityHistogram histogram = new SimilarityHistogram(_scale); diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index ff8f1a2..f6a4d37 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev18"; + public static final String VERSION = "0.29.0.dev19"; } \ No newline at end of file diff --git a/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java b/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java index 3195106..006f3b6 100644 --- a/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java +++ b/src/main/java/org/opendata/curation/d4/experiments/SignatureDropStatsExperiment.java @@ -32,7 +32,6 @@ import org.opendata.core.constraint.GreaterThanConstraint; import org.opendata.core.io.FileSystem; import org.opendata.core.io.SynchronizedWriter; -import 
org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; import org.opendata.db.eq.EQ; import org.opendata.db.eq.EQIndex; @@ -47,7 +46,7 @@ public class SignatureDropStatsExperiment { private class BlockGeneratorTask implements Runnable { - private final CandidateSetFinder _candidateFinder; + private final MaxDropFinder _candidateFinder; private final EQIndex _eqIndex; private final ConcurrentLinkedQueue _queue; private final ContextSignatureGenerator _sigFact; @@ -57,7 +56,7 @@ public BlockGeneratorTask( EQIndex eqIndex, ConcurrentLinkedQueue queue, ContextSignatureGenerator sigFact, - CandidateSetFinder candidateFinder, + MaxDropFinder candidateFinder, SynchronizedWriter writer ) { _eqIndex = eqIndex; @@ -116,7 +115,7 @@ public void run( EQIndex eqIndex, ContextSignatureGenerator sigFact, ConcurrentLinkedQueue queue, - CandidateSetFinder candidateFinder, + MaxDropFinder candidateFinder, int threads, SynchronizedWriter writer ) throws java.lang.InterruptedException, java.io.IOException { diff --git a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java index 88c3aed..d207403 100644 --- a/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java +++ b/src/main/java/org/opendata/curation/d4/signature/RobustSignatureGenerator.java @@ -17,19 +17,20 @@ */ package org.opendata.curation.d4.signature; +import java.io.File; +import java.io.IOException; import java.math.BigDecimal; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; import org.opendata.curation.d4.telemetry.TelemetryCollector; import 
org.opendata.curation.d4.telemetry.TelemetryPrinter; import org.opendata.core.constraint.GreaterThanConstraint; -import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; import org.opendata.curation.d4.signature.trim.ColumnSupportBlockFilter; import org.opendata.curation.d4.signature.trim.LiberalRobustifier; @@ -60,88 +61,31 @@ public class RobustSignatureGenerator { */ private class BlockGeneratorTask implements Runnable { - private final CandidateSetFinder _candidateFinder; + private final SignatureBlocksGenerator _blockGenerator; private final SignatureBlocksConsumer _consumer; - private final boolean _ignoreMinorDrop; private final ConcurrentLinkedQueue _queue; private final ContextSignatureGenerator _sigFact; public BlockGeneratorTask( ConcurrentLinkedQueue queue, ContextSignatureGenerator sigFact, - CandidateSetFinder candidateFinder, - boolean ignoreMinorDrop, + SignatureBlocksGenerator blockGenerator, SignatureBlocksConsumer consumer ) { _queue = queue; _sigFact = sigFact; - _candidateFinder = candidateFinder; - _ignoreMinorDrop = ignoreMinorDrop; + _blockGenerator = blockGenerator; _consumer = consumer; } - private SignatureBlock getBlock(List sig, int start, int end) { - - int[] block = new int[end - start]; - for (int iEl = start; iEl < end; iEl++) { - block[iEl - start] = sig.get(iEl).id(); - } - Arrays.sort(block); - return new SignatureBlock( - block, - sig.get(start).value(), - sig.get(end - 1).value() - ); - } - @Override public void run() { Integer nodeId; while ((nodeId = _queue.poll()) != null) { - List sig; - sig = _sigFact.getSignature(nodeId).rankedElements(); - // No output if the context signautre is empty - if (sig.isEmpty()) { - continue; - } - ArrayList blocks = new ArrayList<>(); - int start = 0; - final int end = sig.size(); - while (start < end) { - int pruneIndex = _candidateFinder.getPruneIndex(sig, start); - if (pruneIndex <= start) { - break; - } - // If the ignoreMinorDrop flag is true check 
that the - // difference at the drop is at least as large as the - // difference between the elements in the block. - if (_ignoreMinorDrop) { - double rightBound = 0; - if (sig.size() < pruneIndex) { - rightBound = sig.get(pruneIndex).value(); - } - double leftBound = sig.get(pruneIndex - 1).value(); - double diff = leftBound - rightBound; - double blockDiff = sig.get(start).value() - leftBound; - if (blockDiff > diff) { - // We encountered a minor drop. If the list of - // blocks is empty (i.e., there is no steepest drop - // but the full signature constrant is not satisfied - // either) we break to return an empty signature. - // Otherwise, we add the remaining elements as the - // final block. - if (blocks.isEmpty()) { - break; - } else { - pruneIndex = end; - } - } - } - blocks.add(this.getBlock(sig, start, pruneIndex)); - start = pruneIndex; - } + List blocks = _blockGenerator + .toBlocks(_sigFact.getSignature(nodeId).rankedElements()); if (!blocks.isEmpty()) { _consumer.consume(nodeId, blocks); } @@ -246,8 +190,10 @@ public void run( new BlockGeneratorTask( queue, sigFact, - candidateFinder, - ignoreMinorDrop, + new SignatureBlocksGenerator( + candidateFinder, + ignoreMinorDrop + ), consumer ) ); @@ -271,4 +217,53 @@ public void run( _telemetry.add(TELEMETRY_ID, execTime); } } + + private static final String COMMAND = + "Usage:\n" + + " \n" + + " [LIBERAL | COLSUPP]\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " "; + + private static final Logger LOGGER = Logger + .getLogger(RobustSignatureGenerator.class.getName()); + + public static void main(String[] args) throws IOException { + + if (args.length != 7) { + System.out.println(COMMAND); + System.exit(-1); + } + + File eqFile = new File(args[0]); + String trimmerSpec = args[1].toUpperCase(); + boolean fullSignatureConstraint = Boolean.parseBoolean(args[2]); + boolean ignoreLastDrop = Boolean.parseBoolean(args[3]); + boolean ignoreMinorDrop = Boolean.parseBoolean(args[4]); + int nodeId = 
Integer.parseInt(args[5]); + File outputFile = new File(args[6]); + + ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); + queue.add(nodeId); + + try { + new RobustSignatureGenerator().run( + new EQIndex(eqFile), + queue, + trimmerSpec, + fullSignatureConstraint, + ignoreLastDrop, + ignoreMinorDrop, + 1, + true, + new SignatureBlocksWriter(outputFile) + ); + } catch (java.lang.InterruptedException | java.io.IOException ex) { + LOGGER.log(Level.SEVERE, "RUN", ex); + System.exit(-1); + } + } } diff --git a/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java new file mode 100644 index 0000000..40484cd --- /dev/null +++ b/src/main/java/org/opendata/curation/d4/signature/SignatureBlocksGenerator.java @@ -0,0 +1,118 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.opendata.curation.d4.signature; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.opendata.core.prune.Drop; +import org.opendata.core.prune.MaxDropFinder; + +/** + * Generator that groups elements in a context signature into blocks based on + * steepest drop. 
+ * + * @author Heiko Mueller + */ +public class SignatureBlocksGenerator { + + private final MaxDropFinder _dropFinder; + private final boolean _ignoreMinorDrop; + + public SignatureBlocksGenerator( + MaxDropFinder dropFinder, + boolean ignoreMinorDrop + ) { + + _dropFinder = dropFinder; + _ignoreMinorDrop = ignoreMinorDrop; + } + + private SignatureBlock getBlock(List sig, int start, int end) { + + int[] block = new int[end - start]; + for (int iEl = start; iEl < end; iEl++) { + block[iEl - start] = sig.get(iEl).id(); + } + Arrays.sort(block); + return new SignatureBlock( + block, + sig.get(start).value(), + sig.get(end - 1).value() + ); + } + + /** + * Group elements in a context signature into blocks. Assumes that the + * elements in the context signature are sorted in decreasing order. + * + * @param sig + * @return + */ + public List toBlocks(List sig) { + + ArrayList blocks = new ArrayList<>(); + + // No output if the context signature is empty + if (sig.isEmpty()) { + return blocks; + } + + int start = 0; + final int end = sig.size(); + while (start < end) { + Drop drop = _dropFinder.getSteepestDrop(sig, start); + /*if (verbose) { + System.out.println( + String.format( + "DROP @ %d (%s) WITH %f", + drop.index(), + Boolean.toString(drop.isFullSignature()), + drop.diff() + ) + ); + }*/ + int pruneIndex = drop.index(); + if (pruneIndex <= start) { + break; + } else if ((!drop.isFullSignature()) && (_ignoreMinorDrop)) { + // If the ignoreMinorDrop flag is true check that the + // difference at the drop is at least as large as the + // difference between the elements in the block. + double leftBound = sig.get(pruneIndex - 1).value(); + double blockDiff = sig.get(start).value() - leftBound; + if (blockDiff > drop.diff()) { + // We encountered a minor drop. If the list of + // blocks is empty (i.e., there is no steepest drop + // but the full signature constraint is not satisfied + // either) we break to return an empty signature.
+ // Otherwise, we add the remaining elements as the + // final block. + if (blocks.isEmpty()) { + return blocks; + } + pruneIndex = end; + } + } + blocks.add(this.getBlock(sig, start, pruneIndex)); + start = pruneIndex; + } + + return blocks; + } +} diff --git a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java index b6dad56..994e077 100644 --- a/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java +++ b/src/main/java/org/opendata/curation/d4/signature/trim/CentristTrimmer.java @@ -24,7 +24,6 @@ import org.opendata.curation.d4.signature.RobustSignature; import org.opendata.core.constraint.GreaterThanConstraint; import org.opendata.core.object.IdentifiableDouble; -import org.opendata.core.prune.CandidateSetFinder; import org.opendata.core.prune.MaxDropFinder; import org.opendata.core.set.HashIDSet; import org.opendata.core.set.IDSet; @@ -43,13 +42,13 @@ public class CentristTrimmer extends SignatureTrimmer { private final int _columnId; - private final CandidateSetFinder _dropFinder; + private final MaxDropFinder _dropFinder; private final BlockScoreFunction _scoreFunc; public CentristTrimmer( IdentifiableIDSet column, BlockScoreFunction scoreFunc, - CandidateSetFinder dropFinder, + MaxDropFinder dropFinder, RobustSignatureConsumer consumer ) { super(column, consumer); diff --git a/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java b/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java index 183d1a9..da56100 100644 --- a/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java +++ b/src/test/java/org/urban/data/test/prune/MaxDropFinderTest.java @@ -26,10 +26,10 @@ import static org.junit.Assert.*; import org.opendata.core.object.IdentifiableDouble; import org.opendata.core.prune.MaxDropFinder; -import org.opendata.core.set.IDSet; /** - * + * Unit tests for the steepest drop finder. 
+ * * @author Heiko Mueller */ public class MaxDropFinderTest { diff --git a/src/test/java/org/urban/data/test/prune/SignatureBlocksGeneratorTest.java b/src/test/java/org/urban/data/test/prune/SignatureBlocksGeneratorTest.java new file mode 100644 index 0000000..33da2d2 --- /dev/null +++ b/src/test/java/org/urban/data/test/prune/SignatureBlocksGeneratorTest.java @@ -0,0 +1,174 @@ +/* + * This file is part of the Data-Driven Domain Discovery Tool (D4). + * + * Copyright (c) 2018-2020 New York University. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.urban.data.test.prune; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.junit.After; +import org.junit.AfterClass; +import static org.junit.Assert.assertEquals; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.opendata.core.prune.MaxDropFinder; +import org.opendata.core.sort.DoubleValueDescSort; +import org.opendata.curation.d4.signature.SignatureBlock; +import org.opendata.curation.d4.signature.SignatureBlocksGenerator; +import org.opendata.curation.d4.signature.SignatureValue; + +/** + * Unit tests for the signature blocks generator. + * + * @author Heiko Mueller + */ +public class SignatureBlocksGeneratorTest { + + /** + * Helper method to generate a sorted list of signature values. 
+ * + * @param values + * @return + */ + private static List toList(double[] values) { + + ArrayList result = new ArrayList<>(); + + for (double val : values) { + result.add(new SignatureValue(result.size(), val)); + } + + Collections.sort(result, new DoubleValueDescSort<>()); + + return result; + } + + @BeforeClass + public static void setUpClass() { + } + + @AfterClass + public static void tearDownClass() { + } + + @Before + public void setUp() { + } + + @After + public void tearDown() { + } + + @Test + public void testFullSignatureConstraint() { + + List values = toList(new double[]{0.8, 0.7, 0.5, 0.45}); + + List blocks; + + // Steepest drop finder that does not consider the full signature + // constraint. + MaxDropFinder dfNoConstraint = new MaxDropFinder<>(false, true); + blocks = new SignatureBlocksGenerator(dfNoConstraint, false).toBlocks(values); + + // Splits values in three blocks [0.8-0.7], [0.5], [0.45]. + assertEquals(3, blocks.size()); + assertEquals(2, blocks.get(0).length()); + assertEquals(1, blocks.get(1).length()); + assertEquals(1, blocks.get(2).length()); + + // Steepest drop finder that does consider the full signature constraint. + MaxDropFinder dfConstraint = new MaxDropFinder<>(true, true); + blocks = new SignatureBlocksGenerator(dfConstraint, false).toBlocks(values); + + // Splits values into one block. + assertEquals(1, blocks.size()); + assertEquals(4, blocks.get(0).length()); + + // If the last drop is considered we always end up with a single block. 
+ for (boolean fullSig : new boolean[]{true, false}) { + MaxDropFinder df = new MaxDropFinder<>(fullSig, false); + blocks = new SignatureBlocksGenerator(df, false).toBlocks(values); + assertEquals(1, blocks.size()); + assertEquals(4, blocks.get(0).length()); + } + } + + @Test + public void testMinorDropDonstraint() { + + List values = toList(new double[]{ + 0.31, + 0.3, + 0.2, + 0.2, + 0.15, + 0.13, + 0.11, + 0.09, + 0.07, + 0.05, + 0.015, + 0.01, + 0.005, + 0.001 + }); + + // The full signature constraint or the last drop should not have an + // impact in this setting. + for (boolean fullSig : new boolean[]{true, false}) { + for (boolean ignoreLast : new boolean[]{true, false}) { + MaxDropFinder df = new MaxDropFinder<>(fullSig, ignoreLast); + List blocks = new SignatureBlocksGenerator(df, true) + .toBlocks(values); + assertEquals(3, blocks.size()); + assertEquals(2, blocks.get(0).length()); + assertEquals(2, blocks.get(1).length()); + assertEquals(10, blocks.get(2).length()); + } + } + + // Same deltas but with steepest last drop. + values = toList(new double[]{ + 0.91, + 0.9, + 0.8, + 0.8, + 0.75, + 0.73, + 0.71, + 0.69, + 0.67, + 0.65, + 0.615, + 0.61, + 0.605, + 0.601 + }); + + // In this case we need to ignore the last drop and the full signature + // constraint.
+ MaxDropFinder df = new MaxDropFinder<>(false, true); + List blocks = new SignatureBlocksGenerator(df, true) + .toBlocks(values); + assertEquals(3, blocks.size()); + assertEquals(2, blocks.get(0).length()); + assertEquals(2, blocks.get(1).length()); + assertEquals(10, blocks.get(2).length()); + } +} From c89a5d3309d5c541cd957fea953ab5d6dff0ca00 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Fri, 11 Dec 2020 11:32:48 -0500 Subject: [PATCH 25/25] Release 0.29.0 --- src/main/java/org/opendata/curation/d4/Constants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opendata/curation/d4/Constants.java b/src/main/java/org/opendata/curation/d4/Constants.java index f6a4d37..bdc84db 100644 --- a/src/main/java/org/opendata/curation/d4/Constants.java +++ b/src/main/java/org/opendata/curation/d4/Constants.java @@ -26,5 +26,5 @@ public final class Constants { public static final String NAME = "D4 - Data-Driven Domain Discovery"; - public static final String VERSION = "0.29.0.dev19"; + public static final String VERSION = "0.29.0"; } \ No newline at end of file