# demo.properties
#####ELASTICSEARCH CONFIGURATION####
#ES cluster name
elasticsearch.cluster.name = elasticsearch
#ES cluster IP/hostname
elasticsearch.cluster.host = localhost
#ES cluster HTTP port (the REST client connects on 9200; the native transport protocol uses 9300)
elasticsearch.cluster.port = 9200
#ES basic auth from the X-Pack security plugin (commercial)
elasticsearch.security.enabled = false
#for Http auth
elasticsearch.security.user = elastic
elasticsearch.security.password = changeme
#ES SSL from the X-Pack security plugin (commercial). Warning: passwords will be sent in plain text
#if SSL is not enabled
elasticsearch.ssl.enabled = false
#Shield username/password. Deprecated
#elasticsearch.shield.user = <user>:<password>
# Enable Shield. Deprecated
#elasticsearch.shield.enabled = false
#ES keystore for ssl
#elasticsearch.shield.ssl.keystore.path = /home/rich/elk/ssh-keystores/tempnode/tempnode.jks
##ES keystore password
#elasticsearch.shield.ssl.keystore.password = <password>
##ES Truststore (see Shield docs)
#elasticsearch.shield.ssl.truststore.path = /home/rich/elk/ssh-keystores/tempnode/tempnode.jks
##ES Truststore password
#elasticsearch.shield.ssl.truststore.password = <password>
##use encryption on transport layer
#elasticsearch.shield.transport.ssl = true
##General ES options
#load data into this index
elasticsearch.index.name = demo_index
#load data into this type
elasticsearch.type = demo_type
#Allow this many ms for cluster response
elasticsearch.response.timeout = 1000000
#If the input SQL query returns columns with these labels, ignore them
elasticsearch.excludeFromIndexing = binaryContent,primaryKeyFieldName
#specify the Joda date pattern that is compatible with the Elasticsearch dynamic field mapping for dates (see the ES docs on date formats)
elasticsearch.datePattern = yyyy-MM-dd'T'HH:mm:ss.SSS
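#For illustration (example value, not from the original config): a timestamp rendered with the
#pattern above looks like 2023-01-13T14:05:09.123, which matches Elasticsearch's default dynamic
#date detection format (strict_date_optional_time).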
#####JOB AND STEP CONFIGURATION####
# commit interval in step - process this many rows before committing results. default 50
step.chunkSize = 50
#number of exceptions that can be thrown before the job fails. default 5
step.skipLimit = 5
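#Worked example (standard Spring Batch semantics, assumed here): with step.chunkSize = 50 and
#step.skipLimit = 5, each chunk of 50 rows is read, processed and written in one transaction,
#and the step tolerates up to 5 skippable exceptions - the 6th fails the job.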
#Asynchronous TaskExecutor thread pool size - for multithreading partitions
step.concurrencyLimit = 2
#The job should complete within this many ms or it will be recorded as a failure.
job.jobTimeout = 10000000
#globally unique job name. default is defaultJob
job.jobName = exampleJob
#Some JDBC drivers don't support socket timeouts, so they can't detect network failures.
#This workaround sets a global socket timeout for the JVM without diving into the OS socket configuration.
#The downside is that it affects every socket in the JVM, so it may not always be appropriate.
#Comment out to disable this global setting.
job.globalSocketTimeout = 30000
#Partitioner configuration. This informs how the total row count per job is broken into
#separate partitions.
#Two partitioner types are available: primary keys only (PK), or timestamps and primary keys (PKTimeStamp).
#If using the scheduler, the PKTimeStamp type should be configured.
partitioner.partitionType = PKTimeStamp
#number of partitions to generate (the total job row count is divided across this many partitions)
partitioner.gridSize = 1
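#Worked example (illustrative arithmetic, not from the original file): with partitioner.gridSize = 4
#and a job covering 1,000,000 rows, the PK/timestamp range is split into 4 partitions of roughly
#250,000 rows each, executed on up to step.concurrencyLimit threads.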
#ms for partition handler to complete partitioning before an exception is thrown
partitioner.partitionHandlerTimeout = 10000000
#name of timestamp column used for partitioning and checking for new data (only if scheduling is used)
partitioner.timeStampColumnName = DocumentTimestamp
#name of PK column used for partitioning and checking for new data
#only use with scheduling if PKs are guaranteed to be generated sequentially
partitioner.pkColumnName = FileID
#this is the table containing the primary keys and optionally, timestamps
partitioner.tableToPartition = demoData
# required prefix on some types of DB (e.g. SQL Server)
partitioner.preFieldsSQL = TOP 100 PERCENT
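#For illustration only - the exact SQL is generated internally, so this shape is an assumption:
#the prefix is inserted directly after SELECT when the partitioner scans the key range, e.g.
#SELECT TOP 100 PERCENT MIN(FileID), MAX(FileID) FROM demoData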
#Scheduling config
#if false, run a single job and then exit process
scheduler.useScheduling = false
##For the meaning of idleTimeout and maxLifetime, see the HikariCP connection pool docs
## SQL SERVER TARGET DB CONFIGURATIONS
target.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
target.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
target.username = sa
target.password = yourStrong(!)Password
target.idleTimeout = 30000
target.maxLifetime = 60000
## SQL SERVER SOURCE DB CONFIGURATIONS
source.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
source.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
source.username = sa
source.password = yourStrong(!)Password
source.idleTimeout = 30000
source.maxLifetime = 60000
## Job Repo DB CONFIGURATIONS
jobRepository.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
jobRepository.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
jobRepository.username = sa
jobRepository.password = yourStrong(!)Password
jobRepository.idleTimeout = 30000
jobRepository.maxLifetime = 60000
#Since different DBMS products interpret the SQL standard for time differently, it is necessary to explicitly specify
#the date type that the database uses, e.g. PostgreSQL = TIMESTAMP, SQL Server = DATETIME
source.dbmsToJavaSqlTimestampType = DATETIME
#The principal SQL block that specifies the data to process. Composed of three parts.
source.selectClause = SELECT FileID, DocumentName, DocumentTimestamp, FilePath, 'demoData' AS tlSrcTableName, 'FilePath' AS tlSrcColumnFieldName, 'FileID' AS tlPrimaryKeyFieldName
source.sortKey = FileID
source.fromClause = FROM master.dbo.demoData
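#For illustration, the three parts above (select clause, sort key, from clause) compose into a
#paging query shaped roughly like the following; the ORDER BY and per-page row limit are added by
#the paging item reader, and exact syntax is DB-dependent and assumed here:
#SELECT FileID, DocumentName, DocumentTimestamp, FilePath, 'demoData' AS tlSrcTableName,
#  'FilePath' AS tlSrcColumnFieldName, 'FileID' AS tlPrimaryKeyFieldName
#FROM master.dbo.demoData
#ORDER BY FileID ASC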
#If writing the CogStack-constructed JSON to another DBMS as well as Elasticsearch, this query specifies how it should be handled
target.Sql =
##paging item reader configuration
#number of rows per page
source.pageSize = 2
# DB column label mapping for table metadata. Required.
source.srcTableName = tlSrcTableName
source.srcColumnFieldName = tlSrcColumnFieldName
source.primaryKeyFieldName = tlPrimaryKeyFieldName
source.primaryKeyFieldValue = FileID
source.timeStamp = DocumentTimestamp
#####TIKA CONFIGURATION####
#Tika item processor output - keep XHTML tags (true) or emit plain text (false)
tika.keepTags = false
# field with binary content for Tika conversion, or field containing the path name
tika.binaryFieldName = FilePath
# fieldname for tika output
tika.tikaFieldName = tikaOutput
# either database (binary content stored in the DB) or fileSystemWithDBPath (the DB stores a file path) for Docman-type systems
tika.binaryContentSource = fileSystemWithDBPath
#drive prefix (the tests use a file on the classpath)
tika.binaryPathPrefix = file:/home/ubuntu/cogstack-master/src/test/resources/tika/testdocs/
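#For illustration (assumed concatenation): with binaryContentSource = fileSystemWithDBPath, the value
#of the FilePath column is appended to this prefix, so FilePath = 'letters/note1.doc' would resolve to
#file:/home/ubuntu/cogstack-master/src/test/resources/tika/testdocs/letters/note1.doc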
spring.profiles.active=basic,elasticsearchRest,localPartitioning,jdbc_in,tika
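#(Gloss inferred from the profile names rather than stated in the original file: elasticsearchRest
#selects the REST Elasticsearch client, localPartitioning runs partitions in-process, jdbc_in reads
#from a JDBC source, and tika enables Tika document conversion.)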