-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsh-fetch.1
executable file
·60 lines (46 loc) · 2.15 KB
/
sh-fetch.1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use LWP::UserAgent;
use XML::Twig;
# Make sure the output directory tree exists. Parents are listed before
# children because mkdir creates only one level at a time. A failed mkdir
# is fatal here so the problem is reported at its source rather than later
# as a failed open.
foreach my $dir ('data', 'data/sh', 'data/sh/index') {
    next if -d $dir;
    mkdir $dir or die "cannot create directory $dir: $!";
}

# One user agent is shared by all requests. Passing an empty hash ref to
# cookie_jar enables an in-memory cookie jar, so cookies set by one response
# are sent with later requests (presumably the portal tracks the result list
# per session -- TODO confirm).
my $browser = LWP::UserAgent->new();
$browser->cookie_jar( {} );

# Start URLs of the searches to crawl, keyed by the prefix that is used in
# the names of the saved files. The two entries differ only in their
# "query" parameter (Eingangsformel / Inhaltsverzeichnis).
my %searchUrls = (
    '1' => 'http://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/228a/page/bsshoprod.psml/js_peid/Suchportlet1/media-type/html?formhaschangedvalue=yes&eventSubmit_doSearch=suchen&action=portlets.jw.MainAction&deletemask=no&wt_form=1&form=bsstdFastSearch&desc=all&query=Eingangsformel&standardsuche=suchen',
    '2' => 'http://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/228a/page/bsshoprod.psml/js_peid/Suchportlet1/media-type/html?formhaschangedvalue=yes&eventSubmit_doSearch=suchen&action=portlets.jw.MainAction&deletemask=no&wt_form=1&form=bsstdFastSearch&desc=all&query=Inhaltsverzeichnis&standardsuche=suchen',
);

# Crawl the searches in sorted key order so that runs are reproducible
# (plain "keys" returns hash entries in no particular order).
foreach my $prefix (sort keys %searchUrls) {
    grabSearch($prefix, $searchUrls{$prefix});
}
# Fetch every result page of one search and store each page on disk.
#
# Parameters:
#   $prefix - label used as the first component of the saved file names
#   $url    - URL of the first result page
#
# Each page is written to data/sh/index/<prefix>.<position>.html, where
# <position> starts at 1 and increases by 25 for every further page. The
# crawl continues for as long as the fetched HTML references the
# "icon_pfeilRechts.gif" image (presumably the "next page" arrow -- TODO
# confirm against the portal's markup).
#
# Uses the file-level user agent $browser. Dies when a request fails, when
# the body cannot be decoded, when a page cannot be written, or when a
# further page is needed but the response carried no Content-Base header.
sub grabSearch {
    my ($prefix, $url) = @_;
    my $currentPosition = 1;

    while ($url) {
        print "Fetching $url...\n";
        my $response = $browser->get($url);
        unless ($response->is_success) {
            die "Could not fetch index data: " . $response->status_line();
        }
        print $response->status_line() . "\n";

        my $html = $response->decoded_content();
        defined $html
            or die "Could not decode response body of $url";

        # Save the page under a name that encodes search prefix and position.
        my $filename = 'data/sh/index/' . $prefix . '.' . $currentPosition . '.html';
        open my $index, '>:encoding(UTF-8)', $filename
            or die "cannot open $filename: $!";
        print {$index} $html
            or die "cannot write $filename: $!";
        # Buffered write errors only surface at close, so check it as well.
        close $index
            or die "cannot close $filename: $!";

        if ($html =~ m!jportal/cms/technik/media/img/prodjur/icon/icon_pfeilRechts.gif!) {
            # The follow-up URL is built relative to the Content-Base header
            # of the current response; read it through the public accessor
            # rather than the response object's internal hash.
            my $contentBase = $response->header('Content-Base');
            defined $contentBase
                or die "No Content-Base header in response for $url";

            # numberofresults is sent as a fixed value (605).
            $url = $contentBase
                 . 'page/bsshoprod.psml/js_peid/Trefferliste/media-type/html'
                 . '?action=portlets.jw.ResultListFormAction&tl=true'
                 . '&eventSubmit_doSkipforward.x=8&eventSubmit_doSkipforward.y=9'
                 . '&currentNavigationPosition=' . $currentPosition
                 . '&numberofresults=605&sortmethod=standard';
            $currentPosition += 25;
        } else {
            # No further page: end the loop.
            $url = undef;
        }
    }
    return;
}