// Absolute path of the Python tool used for the pdf -> xml step.
const pythonTool = path.join(__dirname, 'tools', 'pdf2txt.py');

// Directory that holds the source PDF files.
const pdfDir = path.join(__dirname, 'pdf');

// Source PDF files: every entry of ./pdf whose extension is exactly '.pdf'.
const sourceFiles = fs.readdirSync(pdfDir)
  .filter(base => path.parse(base).ext === '.pdf');

// Run the conversion pipeline for each PDF, one file at a time:
//   pdf -> xml -> json -> txt -> md, then start the translation of the md file.
// Steps 1-4 are synchronous; a failing step throws and aborts the whole run.
for (const file of sourceFiles) {
  // Rename the paper's PDF so that its name contains no whitespace:
  // every whitespace character is replaced with '-'.
  const pathObj = path.parse(file);
  const newName = pathObj.name.replace(/\s/g, '-');
  const newBase = newName + pathObj.ext;
  if (newBase !== file) {
    // NOTE(review): if a file with the sanitized name already exists it may be
    // overwritten by this rename -- confirm that this cannot happen in ./pdf.
    fs.renameSync(path.join(pdfDir, file), path.join(pdfDir, newBase));
  }

  const pdfPath = path.join(pdfDir, newName + '.pdf');
  const xmlPath = path.join(__dirname, 'xml', newName + '.xml');
  const jsonPath = path.join(__dirname, 'json', newName + '.json');
  const txtPath = path.join(__dirname, 'txt', newName + '.txt');
  const mdPath = path.join(__dirname, 'md', newName + '.md');

  // The child processes below are started with argument arrays instead of a
  // shell command string, so paths containing spaces or shell metacharacters
  // (e.g. inside __dirname, which is not sanitized) are passed through intact.
  // NOTE(review): './scripts/...' is resolved against the current working
  // directory, not __dirname -- presumably the script is run from the project root.

  // 1. pdf to xml
  console.log(`Transforming ${newName}.pdf to ${newName}.xml ...`);
  child_process.execFileSync('python', [pythonTool, '-o', xmlPath, pdfPath]);

  // 2. xml to json
  console.log(`Transforming ${newName}.xml to ${newName}.json ...`);
  child_process.execFileSync('node', ['./scripts/xml2json.js', xmlPath, jsonPath]);

  // 3. json to txt
  console.log(`Transforming ${newName}.json to ${newName}.txt...`);
  child_process.execFileSync('node', ['./scripts/json2txt.js', jsonPath, txtPath]);

  // 4. Extract the abstract and the conclusion into the md file.
  console.log(`Extracting structural information from ${newName}.txt to ${newName}.md...`);
  const info = child_process.execFileSync(
    'node',
    ['./scripts/txt2md.js', txtPath, mdPath],
    { encoding: 'utf8' }
  );
  console.log(info);

  // 5. Translation.
  // The asynchronous spawn() is used so the translator's output can be shown
  // while it is still running. spawn() does not block this loop, so with
  // several PDFs several translators run at the same time and hit the
  // translation server concurrently; that is fast, but with many files the
  // server may block the client. A synchronous call would serialize the
  // requests at the cost of live output.
  const translator = child_process.spawn('node', ['./scripts/translate.js', mdPath]);
  translator.stdout.on('data', data => {
    console.log(`Translating: ${data}`);
  });
  // Drain stderr so the translator's error output is visible and the pipe
  // cannot fill up unread.
  translator.stderr.on('data', data => {
    console.error(`Translating ${newName}.md (stderr): ${data}`);
  });
  // Without an 'error' listener a failed spawn is an unhandled 'error' event.
  translator.on('error', err => {
    console.error(`Failed to start translation of ${newName}.md: ${err.message}`);
  });
}