Skip to content

Commit

Permalink
Merge branch 'master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
jan-at-the-hyve committed May 10, 2023
2 parents 2282c2a + c7073da commit 490350e
Show file tree
Hide file tree
Showing 8 changed files with 364 additions and 38 deletions.
17 changes: 17 additions & 0 deletions docs/WhiteRabbit.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@ To increase the memory (in this example to 2400m), either set the environment va
To lower the memory, set one of these variables to e.g. `-Xmx600m`.
If you have a 32-bit Java VM installed and problems persist, consider installing 64-bit Java.

### Temporary Directory for Apache POI
(This addresses [issue 293](https://github.com/OHDSI/WhiteRabbit/issues/293))

The Apache POI library is used for generating the scan report in Excel format. This library creates its own directory for
temporary files in the system temporary directory. In [issue 293](https://github.com/OHDSI/WhiteRabbit/issues/293) it has
been reported that this can cause problems in a multi-user environment, when multiple users attempt to create this directory
with too restrictive permissions (read-only for other users).
WhiteRabbit from version 0.10.9 attempts to circumvent this automatically, but this workaround can fail due
to concurrency problems. If you want to prevent this from happening entirely, you can set either the environment variable
```ORG_OHDSI_WHITERABBIT_POI_TMPDIR``` or the Java system property ```org.ohdsi.whiterabbit.poi.tmpdir``` to a
temporary directory of your choice when starting WhiteRabbit (best would be to add this to the ```whiteRabbit``` or
```whiteRabbit.bat``` script). Please note that this directory should exist before you start WhiteRabbit,
and that it should be writable by any user that may want to run WhiteRabbit.
For each user a separate subdirectory will be created, so that permission related conflicts should be avoided.
Also, WhiteRabbit now attempts to detect this situation before the scan starts. If this is detected,
the scan is not started, and the problem is identified before the scan, instead of afterwards.

## Support
All source code, descriptions and input/output examples are available on GitHub: <https://github.com/OHDSI/WhiteRabbit>

Expand Down
4 changes: 2 additions & 2 deletions rabbit-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<apache-poi-version>4.1.2</apache-poi-version>
</properties>

<dependencies>
Expand Down Expand Up @@ -74,7 +75,7 @@
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version>
<version>4.1.2</version>
</dependency>
<!-- Note: Apache xmlbeans v4.x and v5.x is incompatible with Apache poi v4-->
<dependency>
Expand Down Expand Up @@ -233,6 +234,5 @@
<artifactId>ant</artifactId>
<version>1.10.13</version>
</dependency>

</dependencies>
</project>
13 changes: 13 additions & 0 deletions whiterabbit/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@
<type>pom</type>
<scope>import</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
</dependencies>
</dependencyManagement>

Expand All @@ -80,6 +86,13 @@
<artifactId>slf4j-simple</artifactId>
<version>1.7.30</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>


<!-- https://mvnrepository.com/artifact/org.testcontainers/testcontainers -->
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ else if (iniFile.get("DATA_TYPE").equalsIgnoreCase("BigQuery")) {
sourceDataScan.setMaxValues(maxValues);
sourceDataScan.setCalculateNumericStats(calculateNumericStats);
sourceDataScan.setNumStatsSamplerSize(numericStatsSamplerSize);
sourceDataScan.process(dbSettings, iniFile.get("WORKING_FOLDER") + "/ScanReport.xlsx");
sourceDataScan.process(dbSettings, iniFile.get("WORKING_FOLDER") + "/" + SourceDataScan.SCAN_REPORT_FILE_NAME);
}

private JComponent createTabsPanel() {
Expand Down Expand Up @@ -543,7 +543,7 @@ private JPanel createFakeDataPanel() {
folderPanel.setLayout(new BoxLayout(folderPanel, BoxLayout.X_AXIS));
folderPanel.setBorder(BorderFactory.createTitledBorder("Scan report file"));
scanReportFileField = new JTextField();
scanReportFileField.setText((new File("ScanReport.xlsx").getAbsolutePath()));
scanReportFileField.setText((new File(SourceDataScan.SCAN_REPORT_FILE_NAME).getAbsolutePath()));
scanReportFileField.setToolTipText("The path to the scan report that will be used as a template to generate the fake data");
folderPanel.add(scanReportFileField);
JButton pickButton = new JButton("Pick file");
Expand Down Expand Up @@ -1051,7 +1051,7 @@ public void run() {
table = folderField.getText() + "/" + table;
dbSettings.tables.add(table);
}
sourceDataScan.process(dbSettings, folderField.getText() + "/ScanReport.xlsx");
sourceDataScan.process(dbSettings, folderField.getText() + "/" + SourceDataScan.SCAN_REPORT_FILE_NAME);
}
} catch (Exception e) {
handleError(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
package org.ohdsi.whiteRabbit.scan;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.rmi.RemoteException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.LocalDate;
Expand All @@ -29,11 +33,13 @@
import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.commons.io.FileUtils;
import org.ohdsi.databases.DbType;
import org.ohdsi.databases.RichConnection;
import org.ohdsi.databases.RichConnection.QueryResult;
Expand All @@ -49,10 +55,15 @@

public class SourceDataScan {

public static int MAX_VALUES_IN_MEMORY = 100000;
public static int MIN_CELL_COUNT_FOR_CSV = 1000000;
public static int N_FOR_FREE_TEXT_CHECK = 1000;
public static int MIN_AVERAGE_LENGTH_FOR_FREE_TEXT = 100;
public static int MAX_VALUES_IN_MEMORY = 100000;
public static int MIN_CELL_COUNT_FOR_CSV = 1000000;
public static int N_FOR_FREE_TEXT_CHECK = 1000;
public static int MIN_AVERAGE_LENGTH_FOR_FREE_TEXT = 100;

public final static String SCAN_REPORT_FILE_NAME = "ScanReport.xlsx";

public static final String POI_TMP_DIR_ENVIRONMENT_VARIABLE_NAME = "ORG_OHDSI_WHITERABBIT_POI_TMPDIR";
public static final String POI_TMP_DIR_PROPERTY_NAME = "org.ohdsi.whiterabbit.poi.tmpdir";

private SXSSFWorkbook workbook;
private char delimiter = ',';
Expand All @@ -70,6 +81,15 @@ public class SourceDataScan {

private LocalDateTime startTimeStamp;

// Absolute path of the directory used for Apache POI temporary files; resolved once, when the class is loaded.
static final String poiTmpPath;

static {
    // Resolve into a local first so the final field is assigned in exactly one place.
    String resolvedPoiTmpPath;
    try {
        resolvedPoiTmpPath = setUniqueTempDirStrategyForApachePoi();
    } catch (IOException ioException) {
        // a static initializer cannot propagate a checked exception, so it is wrapped
        throw new RuntimeException(ioException);
    }
    poiTmpPath = resolvedPoiTmpPath;
}

public void setSampleSize(int sampleSize) {
// -1 if sample size is not restricted
Expand Down Expand Up @@ -117,6 +137,78 @@ public void process(DbSettings dbSettings, String outputFileName) {
generateReport(outputFileName);
}

/**
 * Implements a strategy for the tmp dir to use for the temporary files of Apache POI.
 * Attempts to solve an issue where some users report not having write access to the poi tmp dir
 * (see https://github.com/OHDSI/WhiteRabbit/issues/293). Very likely this is caused by the poi tmp dir
 * being created on a multi-user system by a user with a too restrictive file mask.
 *
 * A user configured directory takes precedence: a fresh subdirectory is set up inside it.
 * Otherwise the default poi files directory inside the system tmp dir is kept when it passes the
 * writability check; when it does not, a fresh subdirectory is set up directly inside the system tmp dir.
 *
 * @return the absolute path of the directory selected for the poi temporary files
 * @throws IOException declared for callers; the checks performed here report failures as RuntimeException
 */
public static String setUniqueTempDirStrategyForApachePoi() throws IOException {
    final Path systemTmpRoot = FileUtils.getTempDirectory().toPath();
    final String configuredDir = getUserConfiguredPoiTmpDir();

    final Path selectedDir;
    if (StringUtils.isEmpty(configuredDir)) {
        final Path defaultPoiDir = getDefaultPoiTmpPath(systemTmpRoot);
        // avoid the poi files directory entirely when it is unusable, by creating a separate
        // directory in the standard tmp dir
        selectedDir = isNotWritable(defaultPoiDir) ? setupTmpDir(systemTmpRoot) : defaultPoiDir;
    } else {
        selectedDir = setupTmpDir(Paths.get(configuredDir));
    }

    final String selectedDirName = selectedDir.toFile().getAbsolutePath();
    checkWritableTmpDir(selectedDirName);
    return selectedDirName;
}

/**
 * Returns the location of the poi files directory below the given tmp root.
 *
 * @param tmpRoot the directory that holds the poi files directory
 * @return {@code tmpRoot} extended with the poi files directory name
 */
public static Path getDefaultPoiTmpPath(Path tmpRoot) {
    // TODO: if/when updating poi to 5.x or higher, use DefaultTempFileCreationStrategy.POIFILES instead of a string literal
    // the directory name below is copied from the poi 4.x implementation
    return tmpRoot.resolve("poifiles");
}

/**
 * Creates a uniquely named (random UUID) subdirectory in the given directory and makes
 * Apache POI use it for its temporary files.
 *
 * @param tmpDir the parent directory; it has to pass the writability check
 * @return the path of the newly created subdirectory
 * @throws RuntimeException when the parent directory is not writable, or the subdirectory cannot be created
 */
private static Path setupTmpDir(Path tmpDir) {
    final String parentDirName = tmpDir.toFile().getAbsolutePath();
    checkWritableTmpDir(parentDirName);

    final Path uniqueDir = Paths.get(parentDirName, UUID.randomUUID().toString());
    try {
        Files.createDirectory(uniqueDir);
        // from here on poi creates its temporary files in the new directory
        org.apache.poi.util.TempFile.setTempFileCreationStrategy(
                new org.apache.poi.util.DefaultTempFileCreationStrategy(uniqueDir.toFile()));
    } catch (IOException ioException) {
        throw new RuntimeException(String.format("Exception while creating directory %s", uniqueDir), ioException);
    }
    return uniqueDir;
}

/**
 * Verifies that the given directory can be used for the temporary files of Apache POI.
 * When it cannot, the problem is printed to standard output and reported as an exception.
 *
 * @param dir the name of the directory to check
 * @throws RuntimeException when the directory does not pass the writability check
 */
private static void checkWritableTmpDir(String dir) {
    if (!isNotWritable(Paths.get(dir))) {
        return;
    }
    final String problem = String.format("Directory %s is not writable! (used for tmp files for Apache POI)", dir);
    System.out.println(problem);
    throw new RuntimeException(problem);
}

/**
 * Looks up the directory for poi tmp files as configured by the user.
 * The environment variable overrules the Java system property; the property is only consulted
 * when the environment variable is empty or not set.
 *
 * @return the configured directory, or the (possibly null) property value when the environment variable is empty
 */
private static String getUserConfiguredPoiTmpDir() {
    final String fromEnvironment = System.getenv(POI_TMP_DIR_ENVIRONMENT_VARIABLE_NAME);
    return StringUtils.isEmpty(fromEnvironment)
            ? System.getProperty(POI_TMP_DIR_PROPERTY_NAME)
            : fromEnvironment;
}

/**
 * Tests whether files can be created in (and removed from) the given directory.
 *
 * The check is done by actually creating and deleting a probe file. The probe gets a unique,
 * generated name: with a fixed name, a probe file left behind by an earlier run, by another user,
 * or by a concurrently running check would make the creation fail, and a perfectly writable
 * directory would be reported as not writable.
 *
 * @param path the directory to test
 * @return true when the path is not an existing directory, or when a probe file cannot be
 *         created in it or deleted from it; false when the directory is usable
 */
public static boolean isNotWritable(Path path) {
    if (!Files.isDirectory(path)) {
        // a missing path, or something that is not a directory, cannot hold tmp files
        return true;
    }
    try {
        final Path probeFile = Files.createTempFile(path, "test", ".txt");
        Files.delete(probeFile);
        return false;
    } catch (IOException e) {
        // creating or deleting the probe failed: treat the directory as not writable
        return true;
    }
}

private void processDatabase(DbSettings dbSettings) {
// GBQ requires database. Put database value into domain var
if (dbSettings.dbType == DbType.BIGQUERY) {
Expand Down Expand Up @@ -486,11 +578,11 @@ else if (dbType == DbType.MSSQL || dbType == DbType.PDW) {
trimmedDatabase = database.substring(1, database.length() - 1);
String[] parts = table.split("\\.");
query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + trimmedDatabase + "' AND TABLE_SCHEMA='" + parts[0] +
"' AND TABLE_NAME='" + parts[1] + "';";
"' AND TABLE_NAME='" + parts[1] + "';";
} else if (dbType == DbType.AZURE) {
String[] parts = table.split("\\.");
query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='" + parts[0] +
"' AND TABLE_NAME='" + parts[1] + "';";
"' AND TABLE_NAME='" + parts[1] + "';";
} else if (dbType == DbType.MYSQL)
query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table
+ "';";
Expand All @@ -500,8 +592,7 @@ else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT)
else if (dbType == DbType.TERADATA) {
query = "SELECT ColumnName, ColumnType FROM dbc.columns WHERE DatabaseName= '" + database.toLowerCase() + "' AND TableName = '"
+ table.toLowerCase() + "';";
}
else if (dbType == DbType.BIGQUERY) {
} else if (dbType == DbType.BIGQUERY) {
query = "SELECT column_name AS COLUMN_NAME, data_type as DATA_TYPE FROM " + database + ".INFORMATION_SCHEMA.COLUMNS WHERE table_name = \"" + table + "\";";
}

Expand Down Expand Up @@ -735,7 +826,6 @@ public void processValue(String value) {
samplingReservoir.add(DateUtilities.parseDate(trimValue));
}
}

}

public List<Pair<String, Integer>> getSortedValuesWithoutSmallValues() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package org.ohdsi.whiterabbit.scan;

import org.ohdsi.databases.DbType;
import org.ohdsi.databases.RichConnection;
import org.ohdsi.ooxml.ReadXlsxFileWithHeader;
import org.ohdsi.utilities.files.Row;
import org.ohdsi.utilities.files.RowUtilities;
import org.ohdsi.whiteRabbit.DbSettings;
import org.testcontainers.containers.PostgreSQLContainer;

import java.io.File;
import java.io.FileInputStream;
Expand Down Expand Up @@ -98,4 +101,27 @@ else if (dbType == DbType.ORACLE){
throw new RuntimeException("Unsupported DBType: " + dbType);
}
}

/**
 * Builds the settings needed to scan the database running in the given PostgreSQL test container.
 *
 * @param container the container that supplies the JDBC url, user name and password
 * @return settings for a database source of type PostgreSQL, including the names of the tables found
 */
static DbSettings getTestPostgreSQLSettings(PostgreSQLContainer<?> container) {
    final DbSettings settings = new DbSettings();
    settings.sourceType = DbSettings.SourceType.DATABASE;
    settings.dbType = DbType.POSTGRESQL;
    settings.server = container.getJdbcUrl();
    settings.user = container.getUsername();
    settings.password = container.getPassword();
    settings.database = "public"; // yes, really

    // has to come last: the lookup of the table names connects using the fields set above
    settings.tables = getTableNamesPostgreSQL(settings);
    return settings;
}

/**
 * Retrieves the table names for "public" through a connection that is opened with the given
 * settings, and closed again before returning.
 *
 * @param dbSettings supplies server, domain, user, password and database type for the connection
 * @return the table names reported by the connection
 */
static List<String> getTableNamesPostgreSQL(DbSettings dbSettings) {
    try (RichConnection connection = new RichConnection(
            dbSettings.server,
            dbSettings.domain,
            dbSettings.user,
            dbSettings.password,
            dbSettings.dbType)) {
        final List<String> tableNames = connection.getTableNames("public");
        return tableNames;
    }
}




}
Loading

0 comments on commit 490350e

Please sign in to comment.