-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEmagCrawler.php
222 lines (172 loc) · 6.99 KB
/
EmagCrawler.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
<?php
/**
* This class can be used to crawl items from https://emag.ro which match the price ranges indicated by you
* You can create the links this way:
* Go to the site and select a category, you get this link:
* https://www.emag.ro/aparate-frigorifice/sd?ref=hp_menu_quick-nav_267_0&type=subdepartment
* Now replace the part that comes after the last / with the letter c, like so:
* https://www.emag.ro/aparate-frigorifice/c
* NOTE: This is just an example made for educational purposes; it is not meant to infringe any license
*/
class EmagCrawler
{
    /**
     * Reads the pagination widget of a category page and returns the highest page number it shows.
     *
     * The link must be a clean category link (ex: https://www.emag.ro/televizoare/c).
     * The result is also echoed as "<n> Pages Found".
     *
     * @param string $link
     * @return int The highest page number found in the last pagination label,
     *             or 1 when the page shows no pagination label (or no number in it).
     */
    public function getPages($link)
    {
        $html = new simple_html_dom();
        $html->load_file($link, false, $this->createContext());

        $paginationLabels = $html->find('.pagination .visible-xs');
        $maxPageNumber = 1;
        if (!empty($paginationLabels)) {
            // Only the last label is inspected. It presumably reads like "<current> din <total>"
            // (TODO confirm against the live markup); taking the largest integer in it covers that form.
            $lastLabel = $paginationLabels[count($paginationLabels) - 1]->plaintext;
            if (preg_match_all('/\d+/', (string) $lastLabel, $matches) > 0) {
                // Compare as integers: comparing the raw string tokens is not reliably numeric.
                $maxPageNumber = max(array_map('intval', $matches[0]));
            }
        }

        echo $maxPageNumber . ' Pages Found' . PHP_EOL;
        return $maxPageNumber;
    }

    /**
     * Collects the product links of every page of a category.
     *
     * The link must be a clean category link (ex: https://www.emag.ro/televizoare/c).
     * The number of links found is echoed.
     *
     * @param string $link
     * @param int $maxPageNumber
     * @return array returns an array of links (href values) crawled in the link provided by you
     */
    public function getLinks($link, $maxPageNumber)
    {
        $linkList = $this->collectFromPages(
            $link,
            $maxPageNumber,
            '.card .card-section-wrapper .card-section-top .card-heading a',
            function ($anchor) {
                return $anchor->href;
            }
        );

        echo 'Found ' . count($linkList) . ' links' . PHP_EOL;
        return $linkList;
    }

    /**
     * Collects the product prices of every page of a category.
     *
     * The link must be a clean category link (ex: https://www.emag.ro/televizoare/c).
     * The number of prices found is echoed. Pages are visited in the same order as in
     * getLinks(), which is what lets findPricesBetween() pair the two lists by position.
     *
     * @param string $link
     * @param int $maxPageNumber
     * @return array returns a list of prices (as strings) crawled from the link provided
     */
    public function getPrices($link, $maxPageNumber)
    {
        $priceList = $this->collectFromPages(
            $link,
            $maxPageNumber,
            'div .page-container .card .product-new-price',
            function ($priceNode) {
                // Remove the thousands separator and the currency suffix, then cut the last
                // three characters - presumably the decimal separator plus two decimals
                // (TODO confirm against the live markup).
                $price = str_replace('.', '', $priceNode->plaintext);
                $price = str_replace(' Lei', '', $price);
                return substr($price, 0, -3);
            }
        );

        echo 'Found a total of ' . count($priceList) . ' prices' . PHP_EOL;
        return $priceList;
    }

    /**
     * Compares the prices with a minimum and maximum limit (both inclusive) and reports the matches.
     *
     * Every match is echoed together with the link found at the same position in $linkList,
     * followed by a summary line with the number of matches.
     *
     * @param int $minLimit is the lowest limit
     * @param int $maxLimit is the maximum limit
     * @param array $priceList
     * @param array $linkList
     * @param string $file this parameter is optional, if you set it the results will also go in that file.
     * Be careful though, because the file is re-written each time this runs because of the 'w' option.
     */
    public function findPricesBetween($minLimit, $maxLimit, $priceList, $linkList, $file = null)
    {
        $viableResults = 0;

        $handle = null;
        if (isset($file)) {
            $handle = fopen($file, 'w');
            if ($handle === false) {
                // fopen() has already raised a warning; keep reporting to the console only.
                $handle = null;
            }
        }

        $position = 0;
        foreach ($priceList as $price) {
            if ($price >= $minLimit && $price <= $maxLimit) {
                $itemLink = isset($linkList[$position]) ? $linkList[$position] : '';
                $message = 'Item with link ' . $itemLink . ' has price ' . $price . PHP_EOL;
                echo $message;
                if ($handle !== null) {
                    fwrite($handle, $message);
                }
                $viableResults++;
            }
            // Must advance for every price, matched or not, so that prices and links stay paired.
            $position++;
        }

        echo 'Found ' . $viableResults . ' viable results!' . PHP_EOL;

        if ($handle !== null) {
            fclose($handle);
        }
    }

    /**
     * Returns the category links found on the emag "all departments" page.
     *
     * Only anchors whose href contains "/c" are kept; the query string, if any, is removed
     * and the site domain is prepended.
     * It can be used in a loop to search for things found in ALL of the categories.
     *
     * @return array $categoryLinks
     */
    public function getCategories()
    {
        $categoryLinks = [];
        $html = new simple_html_dom();
        $html->load_file('https://www.emag.ro/all-departments', false, $this->createContext());

        foreach ($html->find('#department-expanded ul li a') as $category) {
            $href = (string) $category->href;
            if (strpos($href, '/c') === false) {
                continue;
            }
            // Strip the query string only when there is one; an href without "?" is kept whole.
            $queryStart = strpos($href, '?');
            $path = ($queryStart === false) ? $href : substr($href, 0, $queryStart);
            $categoryLinks[] = 'https://www.emag.ro' . $path;
        }

        return $categoryLinks;
    }

    /**
     * Builds the stream context (browser-like User-Agent header) used for every request.
     *
     * @return resource
     */
    private function createContext()
    {
        return stream_context_create([
            'http' => [
                'header' => ['User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'],
            ],
        ]);
    }

    /**
     * Turns a category link into the link of one of its numbered pages (".../p<page>/c").
     *
     * Only a trailing "/c" is removed, so category names that themselves start with
     * "c" or "p" are left intact.
     *
     * @param string $link
     * @param int $page
     * @return string
     */
    private function buildPageLink($link, $page)
    {
        // Ignore any query string or fragment, and a trailing slash.
        $base = rtrim(preg_replace('/[?#].*$/', '', $link), '/');
        if (substr($base, -2) === '/c') {
            $base = substr($base, 0, -2);
        }

        return $base . '/p' . $page . '/c';
    }

    /**
     * Loads the category page by page and gathers one value per element matching $selector.
     *
     * NOTE(review): index 0 loads the bare category link and index 1 loads ".../p1/c".
     * If the site serves the same listing for both, the first page is collected twice.
     * The iteration is kept as it was; confirm against the site before changing it.
     *
     * @param string $link
     * @param int $maxPageNumber last page index to load (inclusive)
     * @param string $selector selector passed to simple_html_dom::find()
     * @param callable $extract receives a matched element and returns the value to store
     * @return array the extracted values, in page order
     */
    private function collectFromPages($link, $maxPageNumber, $selector, $extract)
    {
        $html = new simple_html_dom();
        $context = $this->createContext();
        $collected = [];

        for ($page = 0; $page <= $maxPageNumber; $page++) {
            $url = ($page === 0) ? $link : $this->buildPageLink($link, $page);
            $html->load_file($url, false, $context);
            foreach ($html->find($selector) as $element) {
                $collected[] = $extract($element);
            }
        }

        return $collected;
    }
}