Skip to content

Commit e4bc177

Browse files
committed
feat: express 爬虫流程
1 parent de5fc13 commit e4bc177

File tree

10 files changed

+374
-0
lines changed

10 files changed

+374
-0
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,8 @@
1010
/typescript进阶/dist
1111
/typescript进阶/package-lock.json
1212
/typescript进阶/.parcel-cache
13+
14+
#crowller-code-express ignore
15+
/crowller-code-express/node_modules
16+
/crowller-code-express/build
17+
/crowller-code-express/package-lock.json
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"1590472484271":[{"index":0,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":1,"title":"React 16.4 开发简书项目 从零基础入门到实战"},{"index":2,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":3,"title":"从基础到实战\t手把手带你掌握新版Webpack4.0"},{"index":4,"title":"Dell Lee 的微课堂,职业规划答疑解惑,精彩文章与你分享"}],"1590472490515":[{"index":0,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":1,"title":"React 16.4 开发简书项目 从零基础入门到实战"},{"index":2,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":3,"title":"从基础到实战\t手把手带你掌握新版Webpack4.0"},{"index":4,"title":"Dell Lee 的微课堂,职业规划答疑解惑,精彩文章与你分享"}],"1590473203446":[{"index":0,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":1,"title":"React 16.4 开发简书项目 从零基础入门到实战"},{"index":2,"title":"Vue2.5开发去哪儿网App 从零基础入门到实战项目"},{"index":3,"title":"从基础到实战\t手把手带你掌握新版Webpack4.0"},{"index":4,"title":"Dell Lee 的微课堂,职业规划答疑解惑,精彩文章与你分享"}]}

crowller-code-express/package.json

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{
2+
"name": "crowller-code",
3+
"version": "1.0.0",
4+
"description": "typescript crowller",
5+
"main": "index.js",
6+
"scripts": {
7+
"dev:ts": "ts-node ./src/crowller.ts",
8+
"dev:build": "tsc -w",
9+
"dev:start": "nodemon node ./build/index.js",
10+
"dev": "tsc && concurrently \"npm run dev:build\" \"npm run dev:start\""
11+
},
12+
"nodemonConfig": {
13+
"ignore": [
14+
"data/*"
15+
]
16+
},
17+
"keywords": [],
18+
"author": "",
19+
"license": "ISC",
20+
"devDependencies": {
21+
"@types/cheerio": "^0.22.18",
22+
"@types/cookie-session": "^2.0.39",
23+
"@types/express": "^4.17.6",
24+
"@types/superagent": "^4.1.7",
25+
"concurrently": "^5.2.0",
26+
"nodemon": "^2.0.3",
27+
"ts-node": "^8.9.1",
28+
"typescript": "^3.8.3"
29+
},
30+
"dependencies": {
31+
"body-parser": "^1.19.0",
32+
"cheerio": "^1.0.0-rc.3",
33+
"cookie-session": "^1.4.0",
34+
"express": "^4.17.1",
35+
"superagent": "^5.2.2"
36+
}
37+
}

crowller-code-express/readme.md

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# typescript的node爬虫
2+
3+
###### author: 果果
4+
5+
#### 构建
6+
7+
```
8+
npm i or yarn
9+
```
10+
11+
#### 运行
12+
13+
```
14+
#运行ts
15+
npm run dev:ts
16+
17+
#将ts转为js
18+
npm run dev:build
19+
20+
#运行打包之后js
21+
npm run dev:start
22+
23+
#将ts打包并自动编译[运行这个即可]
24+
npm run dev
25+
```
26+
27+
#### 结语
28+
29+
```
30+
技术交流,共同进步!
31+
```
32+

crowller-code-express/src/crowller.ts

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// ts -> .d.ts -> js
2+
// When a TypeScript project imports a package written in plain JavaScript, a .d.ts declaration file is needed to describe its types (here: @types/superagent).
3+
import path from 'path';
4+
import fs from 'fs';
5+
import superagent from 'superagent'; // 爬虫的包
6+
7+
8+
/**
 * Strategy contract the crawler uses to turn a fetched page into the text it
 * persists.
 */
export interface Analyzer {
  /**
   * @param html - Raw HTML of the fetched page.
   * @param filePath - Path of the file the crawler will write the result to.
   * @returns The content to write; the crawler skips the write when it is empty.
   */
  analyze: (html: string, filePath: string) => string;
}
11+
12+
/**
 * Fetches a page, hands its HTML to an analyzer, and persists the analyzer's
 * output to `../data/course.json` (resolved relative to this module).
 *
 * Constructing an instance immediately starts one crawl in the background.
 */
class Crowller {
  /** Destination file for the analyzer's output. */
  private readonly filePath = path.resolve(__dirname, '../data/course.json');

  /**
   * @param url - Page to fetch.
   * @param analyzer - Strategy that converts the fetched HTML into file content.
   */
  constructor(private readonly url: string, private readonly analyzer: Analyzer) {
    // A constructor cannot await, so the crawl is fire-and-forget. Attach a
    // rejection handler so a network or file-system failure is reported
    // instead of surfacing as an unhandled promise rejection.
    this.initSpiderProcess().catch((error: unknown) => {
      console.error(`Crowller: failed to crawl ${this.url}`, error);
    });
  }

  /**
   * Runs one crawl: fetch the page, analyze it, and write any non-empty result.
   * Rejects if the fetch, the analyzer, or the write fails.
   */
  async initSpiderProcess(): Promise<void> {
    const html = await this.getRawHtml();
    const fileContent = this.analyzer.analyze(html, this.filePath);
    if (fileContent) {
      this.writeFile(fileContent);
    }
  }

  /** Downloads `url` and resolves with the response body text. */
  async getRawHtml(): Promise<string> {
    const result = await superagent.get(this.url);
    return result.text;
  }

  /**
   * Synchronously writes `content` to the data file.
   * The parent directory is created first so a fresh checkout without a
   * `data/` folder does not fail with ENOENT.
   */
  writeFile(content: string): void {
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, content);
  }
}
34+
35+
export default Crowller;
36+
37+
38+
39+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/**
 * Merges a custom property into Express's global `Express.Request` interface
 * so request handlers can access `req.teacherName` with type checking.
 */
declare namespace Express {
  interface Request {
    /**
     * Only present when a middleware has assigned it. Declared optional so
     * handlers must handle `undefined` instead of trusting a value that may
     * never have been set.
     */
    teacherName?: string;
  }
}
+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import fs from 'fs';
2+
import cheerio from 'cheerio'; // 数据截取的包
3+
import { Analyzer } from './crowller';
4+
5+
/** One course entry scraped from the page. */
interface Course {
  /** Position of the course item among the matched elements. */
  index: number;
  /** Text of the item's course description element. */
  title: string;
}

/** Outcome of a single crawl. */
interface CourseResult {
  /** Epoch milliseconds at which the page was analyzed. */
  time: number;
  /** Courses found during that crawl. */
  data: Course[];
}

/** Accumulated crawl results, keyed by each crawl's timestamp. */
interface Content {
  [timestamp: number]: Course[];
}
16+
/**
 * Analyzer that extracts course titles from a page's `.course-item` elements
 * and merges them into the JSON history stored at the given file path.
 *
 * Singleton: the constructor is private, obtain the instance via
 * {@link DellAnalyzer.getInstance}.
 */
export default class DellAnalyzer implements Analyzer {
  private static instance: DellAnalyzer;

  /** Returns the shared instance, creating it on first use. */
  static getInstance(): DellAnalyzer {
    if (!DellAnalyzer.instance) {
      DellAnalyzer.instance = new DellAnalyzer();
    }
    return DellAnalyzer.instance;
  }

  private constructor() {}

  /**
   * Collects the `.course-desc` text of every `.course-item` in `html`.
   *
   * @returns The courses found, stamped with the current time in epoch ms.
   */
  private getCourseInfo(html: string): CourseResult {
    const $ = cheerio.load(html);
    const courseInfos: Course[] = [];
    // `.each` rather than `.map`: the callback is run only for its side effect.
    $('.course-item').each((index, element) => {
      const title = $(element).find('.course-desc').text();
      courseInfos.push({ index, title });
    });
    return { time: new Date().getTime(), data: courseInfos };
  }

  /**
   * Merges one crawl into the history stored at `filePath`.
   * A missing, empty, or whitespace-only file is treated as an empty history.
   *
   * @returns The full history including the new crawl, keyed by crawl time.
   * @throws Error when the file holds invalid JSON or JSON that is not an object.
   */
  private generateJsonContent(courseInfos: CourseResult, filePath: string): Content {
    let fileContent: Content = {};
    if (fs.existsSync(filePath)) {
      // Trim first so a file holding only a newline counts as empty instead
      // of crashing JSON.parse.
      const raw = fs.readFileSync(filePath, 'utf-8').trim();
      if (raw) {
        fileContent = DellAnalyzer.parseStoredContent(raw, filePath);
      }
    }
    fileContent[courseInfos.time] = courseInfos.data;
    return fileContent;
  }

  /**
   * Parses previously stored history and checks that it is a plain JSON object.
   * Arrays are rejected explicitly: assigning a timestamp index to an array
   * would create an enormous sparse array that cannot be serialized sensibly.
   * Only the top-level shape is verified; entry values are taken as stored.
   */
  private static parseStoredContent(raw: string, filePath: string): Content {
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error) {
      throw new Error(`Cannot merge crawl results: ${filePath} is not valid JSON`);
    }
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`Cannot merge crawl results: ${filePath} does not contain a JSON object`);
    }
    return parsed as Content;
  }

  /**
   * Public entry point: analyzes `html` and returns the serialized history
   * (existing content of `filePath` plus this crawl) as a JSON string.
   */
  public analyze(html: string, filePath: string): string {
    const courseInfos = this.getCourseInfo(html);
    const fileContent = this.generateJsonContent(courseInfos, filePath);
    return JSON.stringify(fileContent);
  }
}

crowller-code-express/src/index.ts

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import express, {Request, Response, NextFunction} from "express";
2+
import bodyParser from 'body-parser';
3+
import cookieSession from 'cookie-session';
4+
import router from './router';
5+
6+
// Typing caveats when working with Express:
// 1. The .d.ts type definitions for the express library are not always accurate.
// 2. When a middleware modifies req or res, their static types do not change.

const app = express();

// Deployment-specific values come from the environment when provided; the
// fallbacks are the original development defaults. `||` is intentional so an
// empty variable also falls back.
const sessionKey = process.env.SESSION_KEY || 'teacher!lkjsldj';
const port = Number(process.env.PORT) || 7001;

// Middleware registration
app.use(bodyParser.urlencoded({ extended: false }));
app.use(
  cookieSession({
    name: 'session',
    keys: [sessionKey],
    // Cookie options
    maxAge: 24 * 60 * 60 * 1000, // 24 hours
  })
);

// A middleware is just a function, for example:
// app.use((req: Request, res: Response, next: NextFunction) => {
//   req.teacherName = 'guoguo';
//   next();
// });

app.use(router);

app.listen(port, () => {
  console.log(`server is running at ${port}`);
});

crowller-code-express/src/router.ts

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import { Router, Request, Response } from 'express';
2+
import Crowller from './crowller';
3+
import DellAnalyzer from './dellAnalyzer';
4+
import fs from 'fs';
5+
import path from 'path';
6+
7+
// Router instance that every route in this module is registered on; it is the module's default export.
const router = Router();
8+
9+
/**
 * Express `Request` whose `body` is typed as a string-keyed map of optional
 * string values, so reading a form field yields `string | undefined`.
 */
interface RequestWithBody extends Request {
  body: Record<string, string | undefined>;
}
14+
15+
// GET / : home page.
// Reads the `login` flag from the request's session (when a session exists).
// Logged in: responds with links to /getData, /showData and /logout.
// Otherwise: responds with a password form that POSTs to /login.
router.get('/', (req: Request, res: Response) => {
16+
const isLogin = req.session ? req.session.login : undefined;
17+
if (isLogin) {
18+
res.send(`
19+
<html>
20+
<body>
21+
<a href="/getData">爬取数据</a>
22+
<a href="/showData">展示数据</a>
23+
<a href="/logout">退出</a>
24+
</body>
25+
</html>
26+
`)
27+
} else {
28+
res.send(`
29+
<html>
30+
<body>
31+
<form method="post" action="/login">
32+
<input type="password" name="password"></input>
33+
<button>提交</button>
34+
</form>
35+
</body>
36+
</html>
37+
`)
38+
}
39+
40+
})
41+
42+
/**
 * POST /login : password login.
 *
 * - Already logged in: replies that the user is already logged in.
 * - Wrong password: replies with a login-failed message.
 * - Correct password: stores the login flag in the session and redirects to `/`.
 * - Correct password but no session object: replies with HTTP 500, because the
 *   login flag cannot be stored.
 */
router.post('/login', (req: RequestWithBody, res: Response) => {
  const { password } = req.body;
  const { session } = req;

  if (session && session.login) {
    res.send('已经登录过');
    return;
  }

  if (password !== '123') {
    res.send('登录失败');
    return;
  }

  if (!session) {
    // Previously this path sent no response at all, leaving the request
    // hanging until the client timed out.
    res.status(500).send('登录失败');
    return;
  }

  session.login = true;
  // NOTE(review): the 2-second pause before redirecting is kept from the
  // original code; its purpose is not evident from this file, confirm whether
  // it is still needed.
  setTimeout(() => {
    res.redirect('/');
  }, 2000);
});
60+
61+
// GET /logout : clears the session's login flag (when a session exists) and
// redirects back to the home page.
router.get('/logout', (req: RequestWithBody, res: Response) => {
  const { session } = req;
  if (session) {
    session.login = undefined;
  }
  res.redirect('/');
});
68+
69+
// GET /getData : starts a crawl of the demo page for logged-in users.
router.get('/getData', (req: RequestWithBody, res: Response) => {
  const loggedIn = req.session ? req.session.login : false;
  if (!loggedIn) {
    res.send('请登录后进行爬取');
    return;
  }

  // Crawl target: http://www.dell-lee.com/typescript/demo.html?secret=secret
  const secret = 'secretKey';
  const url = `http://www.dell-lee.com/typescript/demo.html?secret=${secret}`;
  // Creating the crawler is what triggers the crawl; the reply below is sent
  // right away rather than after the crawl finishes.
  new Crowller(url, DellAnalyzer.getInstance());
  res.send('getData success');
});
82+
83+
// GET /showData : returns the stored crawl results as JSON for logged-in users.
router.get('/showData', (req: RequestWithBody, res: Response) => {
  const loggedIn = req.session ? req.session.login : false;
  if (!loggedIn) {
    res.send("用户尚未登录,请登录之后进行数据展示");
    return;
  }

  try {
    const dataFile = path.resolve(__dirname, '../data/course.json');
    const stored = fs.readFileSync(dataFile, 'utf-8');
    res.json(JSON.parse(stored));
  } catch (e) {
    // Any failure to read or parse the data file is reported as "no data yet".
    res.send("尚未爬取到数据");
  }
});
97+
98+
// Fallback for any GET path not matched above: reply with a real HTTP 404
// status (previously the body said "404" but the status code was 200).
router.get('/*', (req: Request, res: Response) => {
  res.status(404).send('404');
});
101+
export default router;

crowller-code-express/tsconfig.json

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
{
2+
"compilerOptions": {
3+
/* Basic Options */
4+
// "incremental": true, /* Enable incremental compilation */
5+
"target": "es5", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019' or 'ESNEXT'. */
6+
"module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */
7+
// "lib": [], /* Specify library files to be included in the compilation. */
8+
// "allowJs": true, /* Allow javascript files to be compiled. */
9+
// "checkJs": true, /* Report errors in .js files. */
10+
// "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
11+
// "declaration": true, /* Generates corresponding '.d.ts' file. */
12+
// "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
13+
// "sourceMap": true, /* Generates corresponding '.map' file. */
14+
// "outFile": "./", /* Concatenate and emit output to single file. */
15+
"outDir": "./build", /* Redirect output structure to the directory. */
16+
"rootDir": "./src", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */
17+
// "composite": true, /* Enable project compilation */
18+
// "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
19+
// "removeComments": true, /* Do not emit comments to output. */
20+
// "noEmit": true, /* Do not emit outputs. */
21+
// "importHelpers": true, /* Import emit helpers from 'tslib'. */
22+
// "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
23+
// "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */
24+
25+
/* Strict Type-Checking Options */
26+
"strict": true, /* Enable all strict type-checking options. */
27+
// "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
28+
// "strictNullChecks": true, /* Enable strict null checks. */
29+
// "strictFunctionTypes": true, /* Enable strict checking of function types. */
30+
// "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
31+
// "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
32+
// "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
33+
// "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */
34+
35+
/* Additional Checks */
36+
// "noUnusedLocals": true, /* Report errors on unused locals. */
37+
// "noUnusedParameters": true, /* Report errors on unused parameters. */
38+
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
39+
// "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */
40+
41+
/* Module Resolution Options */
42+
// "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
43+
// "baseUrl": "./", /* Base directory to resolve non-absolute module names. */
44+
// "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
45+
// "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
46+
// "typeRoots": [], /* List of folders to include type definitions from. */
47+
// "types": [], /* Type declaration files to be included in compilation. */
48+
// "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
49+
"esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */
50+
// "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
51+
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
52+
53+
/* Source Map Options */
54+
// "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
55+
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
56+
// "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
57+
// "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */
58+
59+
/* Experimental Options */
60+
// "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
61+
// "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */
62+
63+
/* Advanced Options */
64+
"forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */
65+
}
66+
}

0 commit comments

Comments
 (0)