-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsh-fetch.2
executable file
·61 lines (47 loc) · 1.78 KB
/
sh-fetch.2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use LWP::UserAgent;
use XML::Twig;
use Digest::SHA qw(sha256_hex);
# Make sure the output directory tree exists before anything is written.
# Parents are listed before children so each mkdir finds its parent present.
# A failed mkdir is fatal here; otherwise the problem would only surface
# later as an open() error while saving a downloaded law.
foreach my $dir ('data', 'data/sh', 'data/sh/laws') {
    next if -d $dir;
    mkdir $dir or die "cannot create directory $dir: $!";
}
# HTTP client used for every request made by this script. It is given an
# empty in-memory cookie jar at construction time.
my $browser = LWP::UserAgent->new( cookie_jar => {} );

# Search URL of the portal; SEARCHTERM is presumably a placeholder for the
# query text. NOTE(review): no later statement in this script reads it.
my $searchUrl
    = 'http://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/228a/page/bsshoprod.psml/js_peid/Suchportlet1/media-type/html?formhaschangedvalue=yes&eventSubmit_doSearch=suchen&action=portlets.jw.MainAction&deletemask=no&wt_form=1&form=bsstdFastSearch&desc=all&query=SEARCHTERM&standardsuche=suchen';

# Prefix that is prepended to the hrefs collected from the index pages.
my $baseUrl
    = 'http://www.gesetze-rechtsprechung.sh.juris.de/jportal/portal/t/228a/';

# Hash reference: law short name => href of that law's page.
my $laws = {};
# Scan every saved index page under data/sh/index/ and record, for each law,
# the href of its "Aktuelle Gesamtausgabe" link in %$laws, keyed by the law's
# short name. If the same short name occurs more than once, the last href
# seen wins.
foreach my $index (glob 'data/sh/index/*.html') {
    print "Parsing $index...\n";

    # Most recently seen short name in this file; a link line that follows
    # is attributed to it.
    my $shortName;

    # Lexical handle instead of a bareword global, so it cannot clash with
    # other handles and is released when it goes out of scope.
    open my $indexFh, '<:utf8', $index or die "cannot open $index: $!";

    LINE:
    while (my $line = <$indexFh>) {
        # The text directly in front of "</strong><br />" is the short name.
        if ($line =~ m! *([^<]+)</strong><br />!) {
            $shortName = $1;
        }

        # Only lines carrying the "Aktuelle Gesamtausgabe" link are relevant
        # from here on.
        my ($href) = $line =~ m!<span class="unterstrichen"><a id="[^"]+" class="TrefferlisteHervorheben" title="[^"]+Aktuelle Gesamtausgabe" href="([^"]+)">!;
        next LINE unless defined $href;

        # A link that appears before any short name cannot be attributed to
        # a law; skip it rather than storing it under an empty key.
        unless (defined $shortName) {
            warn "no short name seen before link in $index line $.; skipping\n";
            next LINE;
        }

        print "$shortName...\n";
        $laws->{$shortName} = $href;
    }

    close $indexFh;
}
# Download the page of every law collected in %$laws and store it under
# data/sh/laws/. Laws whose file already exists are skipped, so the script
# can be re-run to continue an interrupted download. Any fetch or write
# failure aborts the script.
foreach my $law (keys %$laws) {
    # The file name is the SHA-256 hex digest of the law's short name.
    my $filename = 'data/sh/laws/' . sha256_hex($law) . '.html';
    next if -e $filename;

    my $lawUrl = $baseUrl . $laws->{$law};

    # Drop the session id that is embedded in the href.
    $lawUrl =~ s/;jsessionid=[^?]+//;

    # The href was copied verbatim out of an HTML attribute, where "&" is
    # written as "&amp;"; turn it back into a plain "&" before requesting.
    $lawUrl =~ s/&amp;/&/g;

    my $lawResponse = $browser->get($lawUrl);
    unless ($lawResponse->is_success) {
        die "Could not fetch law " . $lawResponse->status_line();
    }
    print $law . ": " . $lawResponse->status_line() . "\n";

    # decoded_content returns undef when the body cannot be decoded; treat
    # that as a failure instead of saving an empty file.
    my $html = $lawResponse->decoded_content();
    defined $html or die "Could not decode law " . $law . "\n";

    # Write to a temporary name and rename afterwards, so that a file under
    # the final name is always complete. The existence check above would
    # otherwise skip a partially written file forever.
    my $tmpName = $filename . '.tmp';
    open my $lawFh, '>:utf8', $tmpName or die "cannot open $tmpName: $!";
    print {$lawFh} $html or die "cannot write $tmpName: $!";
    # Buffered write errors only show up at close, so check it as well.
    close $lawFh or die "cannot close $tmpName: $!";
    rename $tmpName, $filename
        or die "cannot rename $tmpName to $filename: $!";
}