# demo.properties
#####ELASTICSEARCH CONFIGURATION####
#ES cluster name
elasticsearch.cluster.name = elasticsearch
#ES cluster IP/hostname
elasticsearch.cluster.host = localhost
#ES cluster HTTP port (the REST client connects on 9200; the native transport protocol uses 9300)
elasticsearch.cluster.port = 9200
#ES basic auth from the X-Pack security plugin (commercial)
elasticsearch.security.enabled = false
#for Http auth
elasticsearch.security.user = elastic
elasticsearch.security.password = changeme
#ES SSL from the X-Pack security plugin (commercial). Warning: passwords will be sent in plain text
#if SSL is not enabled
elasticsearch.ssl.enabled = false
#Shield username/password. Deprecated
#elasticsearch.shield.user = <user>:<password>
# Enable Shield. Deprecated
#elasticsearch.shield.enabled = false
#ES keystore for ssl
#elasticsearch.shield.ssl.keystore.path = /home/rich/elk/ssh-keystores/tempnode/tempnode.jks
##ES keystore password
#elasticsearch.shield.ssl.keystore.password = <password>
##ES Truststore (see Shield docs)
#elasticsearch.shield.ssl.truststore.path = /home/rich/elk/ssh-keystores/tempnode/tempnode.jks
##ES Truststore password
#elasticsearch.shield.ssl.truststore.password = <password>
##use encryption on transport layer
#elasticsearch.shield.transport.ssl = true
##General ES options
#load data into this index
elasticsearch.index.name = demo_index
#load data into this type
elasticsearch.type = demo_type
#Allow this many ms for cluster response
elasticsearch.response.timeout = 1000000
#If the input SQL query returns columns with these labels, ignore them
elasticsearch.excludeFromIndexing = binaryContent,primaryKeyFieldName
#specify the Joda date pattern that is compatible with the Elasticsearch dynamic field mapping for dates (see the ES docs on date formats)
elasticsearch.datePattern = yyyy-MM-dd'T'HH:mm:ss.SSS
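#For illustration (example value, not from the original config): a timestamp rendered with the
#pattern above looks like 2023-01-13T14:05:09.123, which matches Elasticsearch's default dynamic
#date detection format (strict_date_optional_time).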
#####JOB AND STEP CONFIGURATION####
# commit interval in step - process this many rows before committing results. default 50
step.chunkSize = 50
#number of exceptions that can be thrown before the job fails. default 5
step.skipLimit = 5
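#Worked example (standard Spring Batch semantics, assumed here): with step.chunkSize = 50 and
#step.skipLimit = 5, each chunk of 50 rows is read, processed and written in one transaction,
#and the step tolerates up to 5 skippable exceptions - the 6th fails the job.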
#Asynchronous TaskExecutor thread pool size - for multithreading partitions
step.concurrencyLimit = 2
#The job should complete within this many ms or it will be recorded as a failure.
job.jobTimeout = 10000000
#globally unique job name. default is defaultJob
job.jobName = exampleJob
#Some JDBC drivers don't support socket timeouts, so they can't detect network failures.
#This workaround sets a global socket timeout for the JVM without diving into the OS socket configuration.
#The downside is that it affects every socket in the JVM, so it may not always be appropriate.
#Comment out to disable this global setting.
job.globalSocketTimeout = 30000
#Partitioner configuration. This informs how the total row count per job is broken into
#separate partitions.
#Two partitioner types are available: primary keys only (PK), or timestamps and primary keys (PKTimeStamp).
#If using the scheduler, the PKTimeStamp type should be configured.
partitioner.partitionType = PKTimeStamp
#number of partitions to generate (the total job row count is divided across this many partitions)
partitioner.gridSize = 1
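#Worked example (illustrative arithmetic, not from the original file): with partitioner.gridSize = 4
#and a job covering 1,000,000 rows, the PK/timestamp range is split into 4 partitions of roughly
#250,000 rows each, executed on up to step.concurrencyLimit threads.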
#ms for partition handler to complete partitioning before an exception is thrown
partitioner.partitionHandlerTimeout = 10000000
#name of timestamp column used for partitioning and checking for new data (only if scheduling is used)
partitioner.timeStampColumnName = DocumentTimestamp
#name of PK column used for partitioning and checking for new data
#only use with scheduling if PKs are guaranteed to be generated sequentially
partitioner.pkColumnName = FileID
#this is the table containing the primary keys and optionally, timestamps
partitioner.tableToPartition = demoData
# required prefix on some types of DB (e.g. SQL Server)
partitioner.preFieldsSQL = TOP 100 PERCENT
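#For illustration only - the exact SQL is generated internally, so this shape is an assumption:
#the prefix is inserted directly after SELECT when the partitioner scans the key range, e.g.
#SELECT TOP 100 PERCENT MIN(FileID), MAX(FileID) FROM demoData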
#Scheduling config
#if false, run a single job and then exit process
scheduler.useScheduling = false
##For the meaning of idleTimeout and maxLifetime, see the HikariCP connection pool docs
## SQL SERVER TARGET DB CONFIGURATIONS
target.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
target.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
target.username = sa
target.password = yourStrong(!)Password
target.idleTimeout = 30000
target.maxLifetime = 60000
## SQL SERVER SOURCE DB CONFIGURATIONS
source.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
source.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
source.username = sa
source.password = yourStrong(!)Password
source.idleTimeout = 30000
source.maxLifetime = 60000
## Job Repo DB CONFIGURATIONS
jobRepository.JdbcPath = jdbc:sqlserver://localhost:1433;DatabaseName=master
jobRepository.Driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
jobRepository.username = sa
jobRepository.password = yourStrong(!)Password
jobRepository.idleTimeout = 30000
jobRepository.maxLifetime = 60000
#Since different DBMS products interpret the SQL standard for time differently, it is necessary to explicitly specify
#the date type that the database uses, e.g. PostgreSQL = TIMESTAMP, SQL Server = DATETIME
source.dbmsToJavaSqlTimestampType = DATETIME
#The principal SQL block that specifies the data to process. Composed of three parts.
source.selectClause = SELECT FileID, DocumentName, DocumentTimestamp, FilePath, 'demoData' AS tlSrcTableName, 'FilePath' AS tlSrcColumnFieldName, 'FileID' AS tlPrimaryKeyFieldName
source.sortKey = FileID
source.fromClause = FROM master.dbo.demoData
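#For illustration, the three parts above (select clause, sort key, from clause) compose into a
#paging query shaped roughly like the following; the ORDER BY and per-page row limit are added by
#the paging item reader, and exact syntax is DB-dependent and assumed here:
#SELECT FileID, DocumentName, DocumentTimestamp, FilePath, 'demoData' AS tlSrcTableName,
#  'FilePath' AS tlSrcColumnFieldName, 'FileID' AS tlPrimaryKeyFieldName
#FROM master.dbo.demoData
#ORDER BY FileID ASC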
#If writing the CogStack-constructed JSON to another DBMS as well as Elasticsearch, this query specifies how it should be handled
target.Sql =
##paging item reader configuration
#number of rows per page
source.pageSize = 2
# DB column label mapping for table metadata. Required.
source.srcTableName = tlSrcTableName
source.srcColumnFieldName = tlSrcColumnFieldName
source.primaryKeyFieldName = tlPrimaryKeyFieldName
source.primaryKeyFieldValue = FileID
source.timeStamp = DocumentTimestamp
#####TIKA CONFIGURATION####
#Tika item processor output - keep XHTML tags (true) or emit plain text (false)
tika.keepTags = false
# field with binary content for Tika conversion, or field containing the path name
tika.binaryFieldName = FilePath
# fieldname for tika output
tika.tikaFieldName = tikaOutput
# either database (binary content stored in the DB) or fileSystemWithDBPath (the DB stores a file path) for Docman-type systems
tika.binaryContentSource = fileSystemWithDBPath
#drive prefix (the tests use a file on the classpath)
tika.binaryPathPrefix = file:/home/ubuntu/cogstack-master/src/test/resources/tika/testdocs/
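#For illustration (assumed concatenation): with binaryContentSource = fileSystemWithDBPath, the value
#of the FilePath column is appended to this prefix, so FilePath = 'letters/note1.doc' would resolve to
#file:/home/ubuntu/cogstack-master/src/test/resources/tika/testdocs/letters/note1.doc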
spring.profiles.active=basic,elasticsearchRest,localPartitioning,jdbc_in,tika
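#(Gloss inferred from the profile names rather than stated in the original file: elasticsearchRest
#selects the REST Elasticsearch client, localPartitioning runs partitions in-process, jdbc_in reads
#from a JDBC source, and tika enables Tika document conversion.)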