-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagerank.ml
executable file
·209 lines (178 loc) · 5.75 KB
/
pagerank.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
open Util ;;
open CrawlerServices ;;
open Order ;;
open Nodescore ;;
open Graph ;;
(* Dictionaries mapping links to their ranks. Higher is better. *)
module RankDict = Dict.Make(
struct
type key = link
type value = float
let compare = link_compare
let string_of_key = string_of_link
let string_of_value = string_of_float
let gen_key () = {host=""; port=0; path=""}
let gen_key_gt x () = gen_key ()
let gen_key_lt x () = gen_key ()
let gen_key_random () = gen_key ()
let gen_key_between x y () = None
let gen_value () = 0.0
let gen_pair () = (gen_key(),gen_value())
end)
module PageSet = Myset.Make(
struct
type t = page
let compare = (fun a b -> link_compare (a.url) (b.url))
let string_of_t = string_of_page
let gen () = {url={host=""; port=0; path=""}; links=[]; words=[]}
let gen_lt x () = gen ()
let gen_gt y () = gen ()
let gen_random () = gen ()
let gen_between x y () = None
end)
module LinkSet = Myset.Make(
struct
type t = link
let compare = link_compare
let string_of_t = string_of_link
let gen () = {host=""; port=0; path=""}
let gen_lt x () = gen ()
let gen_gt y () = gen ()
let gen_random () = gen ()
let gen_between x y () = None
end)
module PageGraph = Graph (
struct
type node = link
let compare = link_compare
let string_of_node = string_of_link
let gen () = {host=""; port=0; path=""}
end)
module PageScore = NodeScore (
struct
type node = link
let string_of_node = string_of_link
let compare = link_compare
let gen () = {host=""; port=0; path=""}
end)
(* Given a bunch of pages, convert them to a graph *)
let graph_of_pages (pages : PageSet.set) : PageGraph.graph =
(* Only want graph nodes for pages we actually crawled *)
let crawled_links =
PageSet.fold (fun page s -> LinkSet.insert page.url s)
LinkSet.empty pages
in
let add_links page graph =
let add_link g dst =
if LinkSet.member crawled_links dst then
PageGraph.add_edge g page.url dst
else g
in
List.fold_left add_link graph page.links
in
PageSet.fold add_links PageGraph.empty pages
(* The rest of the world wants a RankDict, not a NodeScore. *)
let dict_of_ns (ns : PageScore.node_score_map) : RankDict.dict =
PageScore.fold (fun node score r -> RankDict.insert r node score)
RankDict.empty ns
(* A type for modules that can compute nodescores from graphs *)
module type RANKER =
sig
module G: GRAPH
module NS: NODE_SCORE
val rank : G.graph -> NS.node_score_map
end
(* Each node's rank is equal to the number of pages that link to it. *)
module InDegreeRanker (GA: GRAPH) (NSA: NODE_SCORE with module N = GA.N) :
(RANKER with module G = GA with module NS = NSA) =
struct
module G = GA
module NS = NSA
let rank (g : G.graph) =
let add_node_edges ns node =
let neighbors = match G.neighbors g node with
| None -> []
| Some xs -> xs
in
List.fold_left (fun ns' neighbor -> NS.add_score ns' neighbor 1.0)
ns neighbors
in
let nodes = (G.nodes g) in
List.fold_left add_node_edges (NS.zero_node_score_map nodes) nodes
end
(*****************************************************************)
(* Random Walk Ranker *)
(*****************************************************************)
(*
module type WALK_PARAMS =
sig
(* Should we randomly jump somewhere else occasionally?
if no, this should be None. Else it should be the probability of
jumping on each step *)
val do_random_jumps : float option
(* How far must sisyphus walk? *)
val num_steps : int
end
module RandomWalkRanker (GA: GRAPH) (NSA: NODE_SCORE with module N = GA.N)
(P : WALK_PARAMS) :
(RANKER with module G = GA with module NS = NSA) =
struct
module G = GA
module NS = NSA
(* TODO - fill this in*)
end
*)
(******************* TESTS BELOW *******************)
module TestInDegreeRanker =
struct
module G = NamedGraph
let g = G.add_edge G.empty "a" "b";;
let g2 = G.add_edge g "a" "c";;
module NS = NodeScore (struct
type node = string
let compare = string_compare
let string_of_node = fun x -> x
let gen () = ""
end);;
module Ranker = InDegreeRanker (G) (NS);;
let ns = Ranker.rank g2;;
(* let _ = Printf.printf "NS: %s\n" (NS.string_of_node_score_map ns) ;; *)
assert ((NS.get_score ns "a") = Some 0.0);;
assert ((NS.get_score ns "b") = Some 1.0);;
assert ((NS.get_score ns "c") = Some 1.0);;
assert ((NS.get_score ns "d") = None);;
let g3 = G.add_edge g2 "b" "c";;
let ns2 = Ranker.rank g3;;
assert ((NS.get_score ns2 "a") = Some 0.0);;
assert ((NS.get_score ns2 "b") = Some 1.0);;
assert ((NS.get_score ns2 "c") = Some 2.0);;
end
(*
module TestRandomWalkRanker =
struct
module G = NamedGraph
let g = G.from_edges [("a","b") ;
("b","c") ;
("c","d") ;
("d","e") ;
("e","f") ;
("a","g") ;
("g","a")]
module NS = NodeScore (struct
type node = string
let compare = string_compare
let string_of_node = fun x -> x
let gen () = ""
end);;
module Ranker = RandomWalkRanker (G) (NS)
(struct
let do_random_jumps = None
let num_steps = 1000
end)
let ns = Ranker.rank g
(*let _ = Printf.printf "Testing RandomWalkRanker:\n NS: %s\n"
(NS.string_of_node_score_map ns)
*)
(* That's the problem with randomness -- hard to test *)
end
*)