update the docs

Qihoo360 · Jan 30, 2018 · b96caa2 · b96caa2
1 parent 8555c75
commit b96caa2
Show file tree

Hide file tree

Showing 7 changed files with 55 additions and 36 deletions.
diff --git a/doc/configure.md b/doc/configure.md
@@ -17,31 +17,36 @@ xlearning.ps.num | 0 | number of ps containers to use for the application
 xlearning.ps.memory | 1024MB | amount of memory to use for the ps process  
 xlearning.ps.cores | 1 | number of cores to use for the ps process   
 xlearning.app.queue | DEFAULT | the queue which application submitted to  
-xlearning.app.priority | 3 | the priority of the application, divided into level 0 to 5, corresponding to DEFAULT, VERY\_LOW, LOW, NORMAL, HIGH, VERY\_HIGH
+xlearning.app.priority | 3 | the priority of the application, divided into level 0 to 5, corresponding to DEFAULT, VERY\_LOW, LOW, NORMAL, HIGH, VERY\_HIGH  
 xlearning.input.strategy | DOWNLOAD | loading strategy of input file, including DOWNLOAD, STREAM, PLACEHOLDER  
 xlearning.inputfile.rename | false | whether to rename the download file in the DOWNLOAD strategy of input file  
 xlearning.stream.epoch | 1 | the number of the input file loading in the STREAM strategy of input file  
 xlearning.input.stream.shuffle | false | whether to shuffle the input splits in the STREAM strategy of input file  
 xlearning.inputformat.class | org.apache.hadoop.mapred.TextInputFormat.class | which inputformat implementation to use in the STREAM strategy of input file   
+xlearning.inputformat.cache | false | whether to cache the inputformat file locally when the stream epoch is greater than 1  
+xlearning.inputformat.cachefile.name | inputformatCache.gz | the local cache file name for inputformat  
+xlearning.inputformat.cachesize.limit | 100*1024 | the size limit of the local cache file (in MB)   
 xlearning.output.local.dir | output | If the local output path is not specified, the local directory of the output file is the default value.  
-xlearning.output.strategy | UPLOAD | loading strategy of output file, including DOWNLOAD, STREAM
+xlearning.output.strategy | UPLOAD | saving strategy of output file, including UPLOAD, STREAM  
 xlearning.outputformat.class | TextMultiOutputFormat.class | which outputformat implementation to use in the STREAM strategy of output file  
 xlearning.interresult.dir | /interResult_ | specify the HDFS subdirectory that the intermediate output file upload to  
 xlearning.interresult.upload.timeout | 30 * 60 * 1000 | upload timeout to save the intermediate output (in milliseconds) 
 
 
 
-### TensorFlow Application Configuration  
-The type of the application is "TENSORFLOW"  
+### Board Service Configuration  
 
 Property Name | Default | Meaning  
 ---------------- | --------------- | ---------------  
-xlearning.tf.board.enable | true | If set to false, TensorBoard service is not necessary  
-xlearning.tf.board.worker.index | 0 | the index of the worker which start the service of TensorBoard  
-xlearning.tf.board.reload.interval | 1 | how often the backend should load more data of event log (in seconds)  
+xlearning.tf.board.enable | true | If set to false, Board service is not necessary  
+xlearning.tf.board.worker.index | 0 | the index of the worker which starts the Board service  
 xlearning.tf.board.log.dir | eventLog | the directory saving TensorBoard event log  
-xlearning.tf.board.history.dir | /tmp/XLearning/eventLog | specify the HDFS path which the TensorBoard event log upload to
-
+xlearning.tf.board.history.dir | /tmp/XLearning/eventLog | specify the HDFS path which the TensorBoard event log upload to  
+xlearning.tf.board.reload.interval | 1 | how often the backend should load more data of event log (in seconds) for tensorboard  
+xlearning.board.modelpb | "" | model proto in ONNX format for VisualDL  
+xlearning.board.cache.timeout | 20 | memory cache timeout duration in seconds for VisualDL  
+xlearning.tf.board.path | /bin/tensorboard | the path of the tensorboard  
+xlearning.board.path | /bin/visualDL | the path of the visualDL  
 
 
 ### System Configuration
@@ -53,10 +58,11 @@ xlearning.allocate.interval | 1000ms | interval between the AM get the container
 xlearning.status.update.interval | 1000ms | interval between the AM report the state to RM  
 xlearning.task.timeout | 5 * 60 * 1000 | communication timeout between the AM and container (in milliseconds) 
 xlearning.task.timeout.check.interval | 3 * 1000 | how often the AM check the timeout of the container (in milliseconds)  
+xlearning.localresource.timeout | 5 * 60 * 1000 | timeout for downloading the local resources (in milliseconds)  
 xlearning.messages.len.max | 1000 | Maximum size (in bytes) of message queue  
 xlearning.execute.node.limit | 200 | Maximum number of nodes that application use  
 xlearning.staging.dir | /tmp/XLearning/staging | HDFS directory that application local resources upload to  
-xlearning.cleanup.enable | true | whether delete the resources after the application finished
+xlearning.cleanup.enable | true | whether to delete the resources after the application finishes  
 xlearning.container.maxFailures.rate | 0.5 | maximum percentage of the failure containers   
 xlearning.download.file.retry | 3 | Maximum number of retries for the input file download when the strategy of input file is DOWNLOAD  
 xlearning.download.file.thread.nums | 10 | number of download threads of the input file in the strategy of DOWNLOAD
@@ -69,6 +75,7 @@ xlearning.user.classpath.first | true |  whether user's job jar should be the fi
 xlearning.worker.mem.autoscale | 0.5 | automatic memory scale ratio of worker when application retry after failed.   
 xlearning.ps.mem.autoscale | 0.2 | automatic memory scale ratio of ps when application retry after failed.   
 xlearning.app.max.attempts | 1 | the number of application attempts; by default the application is not retried after a failure.   
+xlearning.report.container.status | true | whether the client reports the status of the containers.  
 
 
 
@@ -80,7 +87,7 @@ Property Name | Default | Meaning
 xlearning.history.log.dir | /tmp/XLearning/history | the HDFS directory that saves the history log  
 xlearning.history.log.delete-monitor-time-interval | 24 * 60 * 60 * 1000 | set the time interval by which the application history logs will be checked to clean (in milliseconds)  
 xlearning.history.log.max-age-ms | 24 * 60 * 60 * 1000 | how long the history log can be saved (in milliseconds)  
-xlearning.history.port | 10021 | port for the history service
+xlearning.history.port | 10021 | port for the history service  
 xlearning.history.address | 0.0.0.0:10021 | address for the history service  
 xlearning.history.webapp.port | 19886 | port for the history http web service  
 xlearning.history.webapp.address | 0.0.0.0:19886 | address for the history http web service  

diff --git a/doc/configure_cn.md b/doc/configure_cn.md
@@ -13,34 +13,40 @@ xlearning.am.cores | 1 | AM申请所需CPU核数
 xlearning.worker.num | 1 | worker启动数目  
 xlearning.worker.memory | 1024 | worker申请使用内存大小，单位为MB  
 xlearning.worker.cores | 1 | worker申请使用CPU核数  
-xlearning.ps.num | 0 | ps启动数目，默认作业不使用ParameterServer机制
+xlearning.ps.num | 0 | ps启动数目，默认作业不使用ParameterServer机制  
 xlearning.ps.memory | 1024 | ps申请使用内存大小，默认单位为MB  
-xlearning.ps.cores | 1 | ps申请使用CPU核数
+xlearning.ps.cores | 1 | ps申请使用CPU核数  
 xlearning.app.queue | DEFAULT | 作业提交队列  
-xlearning.app.priority | 3 | 作业优先级，级别0-5，分别对应DEFAULT、VERY\_LOW、LOW、NORMAL、HIGH、VERY\_HIGH
+xlearning.app.priority | 3 | 作业优先级，级别0-5，分别对应DEFAULT、VERY\_LOW、LOW、NORMAL、HIGH、VERY\_HIGH  
 xlearning.input.strategy | DOWNLOAD | 输入文件加载模式，目前主要有DOWNLOAD、STREAM、PLACEHOLDER  
 xlearning.inputfile.rename | false | 输入文件下载至本地是否需要重命名，该选项只用于输入文件加载模式为DOWNLOAD时  
 xlearning.stream.epoch | 1 | 输入文件加载次数，该选项只用于输入文件加载策略为STREAM时  
 xlearning.input.stream.shuffle | false | 输入文件是否采用shuffle模式，该选项只用于输入文件加载模式为STREAM时  
-xlearning.inputformat.class | org.apache.hadoop.mapred.TextInputFormat.class | STREAM模式下，输入文件inputformat类指定
+xlearning.inputformat.class | org.apache.hadoop.mapred.TextInputFormat.class | STREAM模式下，输入文件inputformat类指定  
+xlearning.inputformat.cache | false | stream epoch大于1时，是否采用缓存至本地文件的操作  
+xlearning.inputformat.cachefile.name | inputformatCache.gz | inputformat缓存至本地的文件名称  
+xlearning.inputformat.cachesize.limit | 100*1024 | inputformat缓存于本地的文件大小上限，单位为MB  
 xlearning.output.local.dir | output | 输出文件本地默认路径，该选项只用于作业提交参数output未指定本地输出路径时  
-xlearning.output.strategy | UPLOAD | 输出文件加载策略，目前主要有DOWNLOAD、STREAM
-xlearning.outputformat.class | TextMultiOutputFormat.class | STREAM模式下，输出文件outputformat类指定
+xlearning.output.strategy | UPLOAD | 输出文件加载策略，目前主要有UPLOAD、STREAM  
+xlearning.outputformat.class | TextMultiOutputFormat.class | STREAM模式下，输出文件outputformat类指定  
 xlearning.interresult.dir | /interResult_ | 指定模型中间结果上传至HDFS子路径  
 xlearning.interresult.upload.timeout | 30 * 60 * 1000 | 模型中间结果上传至HDFS超时时长设置，单位为毫秒  
 
 
 
-### TensorFlow应用配置  
-以下配置项仅用于应用类型为TENSORFLOW时  
+### Board服务配置  
 
 配置名称 | 默认值 | 含义  
 ---------------- | --------------- | ---------------  
-xlearning.tf.board.enable | true | TensorBoard服务是否开启  
-xlearning.tf.board.worker.index | 0 | 指定开启TensorBoard服务所在的worker index  
+xlearning.tf.board.enable | true | Board服务是否开启  
+xlearning.tf.board.worker.index | 0 | 指定开启Board服务所在的worker index  
+xlearning.tf.board.log.dir | eventLog | 指定Board日志存放路径，默认为本地路径./eventLog  
+xlearning.tf.board.history.dir | /tmp/XLearning/eventLog | 指定Board日志上传至HDFS路径  
 xlearning.tf.board.reload.interval | 1 | 指定TensorBoard数据加载时间间隔，单位为秒  
-xlearning.tf.board.log.dir | eventLog | 指定TensorBoard日志存放路径，默认为本地路径./eventLog  
-xlearning.tf.board.history.dir | /tmp/XLearning/eventLog | 指定TensorBoard日志上传至HDFS路径  
+xlearning.board.modelpb | "" | 指定VisualDL加载的模型文件  
+xlearning.board.cache.timeout | 20 | 指定VisualDL缓存加载间隔，单位为秒  
+xlearning.tf.board.path | /bin/tensorboard | 指定TensorBoard服务路径  
+xlearning.board.path | /bin/visualDL | 指定VisualDL服务路径  
 
 
 
@@ -53,6 +59,7 @@ xlearning.allocate.interval | 1000 | AM获取RM分配container状态时间间隔
 xlearning.status.update.interval | 1000 | AM向RM汇报状态时间间隔，单位为毫秒  
 xlearning.task.timeout | 5 * 60 * 1000 | container超时时长，单位为毫秒  
 xlearning.task.timeout.check.interval | 3 * 1000 | container超时检查时间间隔，单位为毫秒  
+xlearning.localresource.timeout | 5 * 60 * 1000 | container下载本地资源超时时长，单位为毫秒  
 xlearning.messages.len.max | 1000 | 消息队列大小限制，单位为字节  
 xlearning.execute.node.limit | 200 | 作业申请节点数目上限  
 xlearning.staging.dir | /tmp/XLearning/staging | 作业本地资源上传至HDFS路径  
@@ -69,6 +76,7 @@ xlearning.user.classpath.first | true | 是否优先加载用户自定义jar包
 xlearning.worker.mem.autoscale | 0.5 | 作业失败重试时，worker内存自动增长比例   
 xlearning.ps.mem.autoscale | 0.2 | 作业失败重试时，ps内存自动增长比例   
 xlearning.app.max.attempts | 1 | 作业执行次数，默认执行失败后不重试   
+xlearning.report.container.status | true | client端打印container运行状态信息  
 
 
 
@@ -79,7 +87,7 @@ xlearning.app.max.attempts | 1 | 作业执行次数，默认执行失败后不
 xlearning.history.log.dir | /tmp/XLearning/history | history日志存放所在hdfs地址  
 xlearning.history.log.delete-monitor-time-interval | 24 * 60 * 60 * 1000 | history日志清理检测时间间隔，单位为毫秒  
 xlearning.history.log.max-age-ms | 24 * 60 * 60 * 1000 | history日志保存时长，单位为毫秒  
-xlearning.history.port | 10021 | history服务开放端口
+xlearning.history.port | 10021 | history服务开放端口  
 xlearning.history.address | 0.0.0.0:10021 | history服务开放地址  
 xlearning.history.webapp.port | 19886 | history服务web应用开放端口  
 xlearning.history.webapp.address | 0.0.0.0:19886 | history服务web应用开放地址  

diff --git a/doc/faq.md b/doc/faq.md
@@ -109,6 +109,6 @@ XLearning1.1 support the application retry and memory auto scaled after failed b
 - xlearning.ps.mem.autoscale  
 Note that the AM connection error reported at the client when the application retries can be ignored. 
 
-### 7. Report the error：" java.io.IOException: Cannot run program "tensorboard": error=2, No such file or directory" after submit the application.       
+### 11. The error "java.io.IOException: Cannot run program "tensorboard": error=2, No such file or directory" is reported after submitting the application.       
 When the XLearning client submits a job, the --user-path "/root/anaconda2/lib/python2.7/site-packages/tensorboard" is added to specify the tensorboard path.
 
diff --git a/doc/submit.md b/doc/submit.md
@@ -28,11 +28,13 @@ worker-cores | number of cores to use for the worker process, default as the con
 worker-memory | amount of memory to use for the worker process(in MB), default as the configure of xlearning.worker.memory  
 queue | the queue of application submitted to, default as the configure of xlearning.app.queue  
 priority | the priority of application, default as the configure of xlearning.app.priority  
-board-enable | whether to start the service of TensorBoard, default as the configure of xlearning.tf.board.enable  
-board-index | specify the index of worker which start the TensorBoard, default as the configure of xlearning.tf.board.worker.index  
-board-logdir | the directory save TensorBoard event log, default as the configure of xlearning.tf.board.log.dir  
-board-reloadinterval | how often the backend should load more data of event log, default as the configure of xlearning.tf.board.reload.interval  
-board-historydir | specify the HDFS path which the TensorBoard event log upload to, default as the configure of xlearning.tf.board.history.dir
+board-enable | whether to start the service of Board, default as the configure of xlearning.tf.board.enable  
+board-index | specify the index of worker which start the Board, default as the configure of xlearning.tf.board.worker.index  
+board-logdir | the directory in which the Board event log is saved, default as the configure of xlearning.tf.board.log.dir  
+board-reloadinterval | how often the backend should load more data of event log for tensorboard, default as the configure of xlearning.tf.board.reload.interval  
+board-historydir | specify the HDFS path which the Board event log upload to, default as the configure of xlearning.tf.board.history.dir  
+board-modelpb | model proto in ONNX format for VisualDL, default as the configure of xlearning.board.modelpb  
+board-cacheTimeout | memory cache timeout duration in seconds for VisualDL, default as the configure of xlearning.board.cache.timeout  
 input-strategy | the strategy of the input file, default as the configure of xlearning.input.strategy  
 inRenameInputFile | whether to rename the download file when input-strategy is "DOWNLOAD", default as the configure of xlearning.inputfile.rename  
 stream-epoch | specify the epoch num of the input file read when input-strategy is "STREAM", default as the configure of xlearning.stream.epoch  

diff --git a/doc/submit_cn.md b/doc/submit_cn.md
@@ -28,11 +28,13 @@ worker-cores | 指定worker申请的CPU核数，默认个数为系统配置xlear
 worker-memory | 指定worker申请内存，默认单位为MB，默认大小为系统配置xlearning.worker.memory  
 queue | 指定作业提交队列，默认为系统配置xlearning.app.queue  
 priority | 指定作业提交优先级，默认为系统配置xlearning.app.priority对应级别  
-board-enable | 是否开启TensorBoard服务，默认为系统配置xlearning.tf.board.enable  
-board-index | 指定开启TensorBoard服务的work index，默认为系统配置xlearning.tf.board.worker.index  
-board-logdir | TensorBoard日志存放路径，默认为系统配置xlearning.tf.board.log.dir  
+board-enable | 是否开启Board服务，默认为系统配置xlearning.tf.board.enable  
+board-index | 指定开启Board服务的worker index，默认为系统配置xlearning.tf.board.worker.index  
+board-logdir | Board日志存放路径，默认为系统配置xlearning.tf.board.log.dir  
 board-reloadinterval | TensorBoard数据加载时间间隔，默认为系统配置xlearning.tf.board.reload.interval  
-board-historydir | TensorBoard日志HDFS上传路径，默认为系统配置xlearning.tf.board.history.dir
+board-historydir | Board日志HDFS上传路径，默认为系统配置xlearning.tf.board.history.dir  
+board-modelpb | VisualDL加载的模型文件，默认为系统配置xlearning.board.modelpb  
+board-cacheTimeout | VisualDL加载缓存间隔时间，默认为系统配置xlearning.board.cache.timeout  
 input-strategy | 输入文件加载策略，默认为系统配置xlearning.input.strategy  
 inRenameInputFile | 当输入文件加载策略为DOWNLOAD时，设置是否对下载后的文件进行重命名，默认为系统配置xlearning.inputfile.rename  
 stream-epoch | 当输入文件加载策略为STREAM时，流式数据读取次数，默认为系统配置xlearning.stream.epoch  

diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>net.qihoo</groupId>
     <artifactId>xlearning</artifactId>
-    <version>1.1</version>
+    <version>1.2</version>
 
 
     <properties>

diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF
@@ -1,4 +1,4 @@
-Manifest-Version: 1.1
+Manifest-Version: 1.2
 Created-By: net.qihoo.xitong
 Built-By: Qihoo
 Main-Class: net.qihoo.xlearning.client.Client