Skip to content

Commit 929fbb7

Browse files
committed
v1.0
1 parent dffc559 commit 929fbb7

29 files changed

+771359
-580
lines changed

.gitignore

+4-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,5 @@
1-
config.js
1+
# node 模块
2+
/node_modules
23

4+
# 配置文件
5+
config.*

index.js

+33-17
Original file line number | Diff line number | Diff line change
@@ -13,39 +13,55 @@ const child_process = require('child_process');
1313
const pythonTool = path.join(__dirname, 'tools', 'pdf2txt.py');
1414

1515

16-
// translation source files
16+
// source pdf files
1717
const sourceFiles = fs.readdirSync(path.join(__dirname, 'pdf'))
1818
.filter(base => path.parse(base).ext === '.pdf');
1919

2020

2121
// translating
2222
sourceFiles.forEach(file => {
23-
// path
24-
const name = path.parse(file).name;
25-
const pdfPath = path.join(__dirname, 'pdf', name + '.pdf');
26-
const xmlPath = path.join(__dirname, 'xml', name + '.xml');
27-
const jsonPath = path.join(__dirname, 'json', name + '.json');
28-
const txtPath = path.join(__dirname, 'txt', name + '.txt');
29-
const mdPath = path.join(__dirname, 'md', name + '.md');
23+
// 给学术论文pdf改名,去掉所有空格
24+
const pathObj = path.parse(file);
25+
const newName = pathObj.name.replace(/[\s]/g, '-');
26+
const newBase = newName + pathObj.ext;
27+
fs.renameSync(path.join(__dirname, 'pdf', file), path.join(__dirname, 'pdf', newBase));
28+
29+
30+
const pdfPath = path.join(__dirname, 'pdf', newName + '.pdf');
31+
const xmlPath = path.join(__dirname, 'xml', newName + '.xml');
32+
const jsonPath = path.join(__dirname, 'json', newName + '.json');
33+
const txtPath = path.join(__dirname, 'txt', newName + '.txt');
34+
const mdPath = path.join(__dirname, 'md', newName + '.md');
3035

3136

3237
// 1. pdf to xml
33-
console.log(`Transforming and parsing ${name}.pdf ...`);
38+
console.log(`Transforming ${newName}.pdf to ${newName}.xml...`);
3439
child_process.execSync(`python ${pythonTool} -o ${xmlPath} ${pdfPath}`);
3540

3641

37-
// 2. xml to txt
38-
console.log(`Extracting structural information from ${name}.pdf ...`);
39-
child_process.execSync(`node ./scripts/xml2txt.js ${xmlPath} ${jsonPath} ${txtPath}`);
42+
// 2. xml to json
43+
console.log(`Transforming ${newName}.xml to ${newName}.json...`);
44+
child_process.execSync(`node ./scripts/xml2json.js ${xmlPath} ${jsonPath}`);
4045

4146

47+
// 3. json to txt
48+
console.log(`Transforming ${newName}.json to ${newName}.txt...`);
49+
child_process.execSync(`node ./scripts/json2txt.js ${jsonPath} ${txtPath}`);
4250

43-
// 3. translation,两种写法均可
44-
// const stream = child_process.execSync('node ./scripts/translate.js', { encoding: 'utf8' });
45-
// console.log(stream);
4651

52+
// 4. 提取摘要和结论
53+
console.log(`Extracting structural information from ${newName}.txt to ${newName}.md...`);
54+
const info = child_process.execSync(`node ./scripts/txt2md.js ${txtPath} ${mdPath}`, { encoding: 'utf8' });
55+
console.log(info);
4756

57+
// 5. translation,两种写法均可
58+
// 要实时显示程序输出,得用异步版的.spawn()
59+
// 但异步版的.spawn()堵塞不住主进程,如果有多个文件需要翻译,循环中,主进程继续运行到.spawn()就会再开一个不堵塞的异步子进程。这样就可以出现多个子进程同时访问服务器,虽然翻译地快,但容易被封杀,特别是文件多的时候。
60+
// const output = child_process.execSync(`node ./scripts/translate.js ${mdPath}`, { encoding: 'utf8' });
61+
// console.log(output);
4862

49-
// const stream = child_process.spawnSync('node', ['./scripts/translate.js', txtPath, 'en', 'zh', 'md'], { encoding: 'utf8' }); // 这是一个流
50-
// console.log(stream.stdout); // 持续输出这个流
63+
const stream = child_process.spawn('node', ['./scripts/translate.js', mdPath], { shell: true });
64+
stream.stdout.on('data', data => {
65+
console.log(`Translating: ${data}`);
66+
});
5167
});

index.py

-46
This file was deleted.

0 commit comments

Comments
 (0)