#!/usr/bin/env perl
use strict;
use warnings;
use 5.10.0;
use Getopt::Long qw(:config auto_help);
use HTTP::Cookies::Netscape;
use LWP::UserAgent;
use Pod::Usage;
use Try::Tiny;
use XML::FeedPP;
use POE qw(Wheel::Run Filter::Reference);
# get options passed in and get a ref to the URLs we're grabbing
my ($opts, $urls) = get_options();
# liberally applied from http://poe.perl.org/?POE_Cookbook/Child_Processes_3
POE::Session->create(
    inline_states => {
        _start      => \&start_tasks,
        next_task   => \&start_tasks,
        task_done   => \&handle_task_done,
        task_debug  => sub { say STDERR $_[ARG0] },
        task_output => sub { say $_[ARG0] },
        sig_child   => \&sig_child,
    }
);
# Start as many tasks as needed so that the number of tasks is no more
# than the limit. Every wheel event is accompanied by the
# wheel's ID. This function saves each wheel by its ID so it can be
# referred to when its events are handled.
sub start_tasks {
    my ($kernel, $heap) = @_[KERNEL, HEAP];

    while (keys(%{$heap->{task}}) < $opts->{workers}) {
        my $next_task = shift @$urls;
        last unless defined $next_task;

        my $task = POE::Wheel::Run->new(
            Program     => sub { process_url($next_task, $opts) },
            StdoutEvent => "task_output",
            StderrEvent => "task_debug",
            CloseEvent  => "task_done",
        );
        $heap->{task}->{$task->ID} = $task;
        $kernel->sig_child($task->PID, "sig_child");
    }
}
# The task is done. Delete the child wheel, and try to start a new
# task to take its place.
sub handle_task_done {
    my ($kernel, $heap, $task_id) = @_[KERNEL, HEAP, ARG0];
    delete $heap->{task}->{$task_id};
    $kernel->yield("next_task");
}
# Detect the CHLD signal as each of our children exits.
sub sig_child {
    my ($heap, $sig, $pid, $exit_val) = @_[HEAP, ARG0, ARG1, ARG2];
    my $details = delete $heap->{$pid};
}
# Run until there are no more tasks.
$poe_kernel->run();
exit 0;
sub process_url {
    my ($url, $opts) = @_;

    binmode(STDOUT, ':encoding(UTF-8)');
    binmode(STDERR, ':encoding(UTF-8)');

    # get everything we know about this url
    my $rss_cache = rss2text::cache->new($url, $opts->{cache}, $opts->{cache_dir});
    if ($opts->{debug}) {
        say $rss_cache->get_cache_filename();
        return;
    }
    $rss_cache->get_cached_rss();

    # get everything the internet knows about this url
    my $feed = get_xml_feed($url, $rss_cache, $opts->{cookie_path});
    return unless keys(%$feed);
    return unless defined($feed->{feed});
    return unless $feed->{feed}->get_item();

    # say each link if it's new
    foreach my $item ( $feed->{feed}->get_item() ) {
        last if ($rss_cache->is_cached_newer(
            $item->pubDate() // $item->get('pubDate') // $feed->{feed}->pubDate())
        );
        (my $output = $opts->{format}) =~ s/__([^\s]*?)__/parse_token($item, $1)/ge;
        say $output;
    }

    # update the cache with information about the feed
    $rss_cache->update_rss_cache($feed);
}
sub get_options {
    # default settings
    my %opts = (
        format      => '__link__',
        cache       => 1,
        cache_dir   => '/tmp/rss2text/',
        cookie_path => undef,
        input       => undef,
        debug       => undef,
        workers     => 10,
    );

    GetOptions(\%opts,
        'format|f:s',
        'cache|c!',
        'cache_dir:s',
        'cookie_path:s',
        'input|i:s',
        'debug|d!',
        'workers|w:i',
    ) or pod2usage(2);

    my $urls;
    # be like wget, accept urls through command-line or STDIN
    my $input = $opts{input};
    if (defined($input)) {
        if ($input eq '-') {
            while (defined(my $line = <STDIN>)) {
                chomp $line;
                next unless $line;
                push @$urls, $line;
            }
        } else {
            unless (-r $input) {
                say STDERR "$input is not a readable file, bailing";
                exit 1;
            }
            open my $fh, '<', $input or die "Unable to read $input, bailing. Reason: $!";
            while (defined(my $line = <$fh>)) {
                chomp $line;
                next unless $line;
                push @$urls, $line;
            }
        }
    }
    # they can specify a file, and just like wget, we'll still eat the urls on the commandline
    push @$urls, @ARGV;

    # expand newlines and tabs using cool double eval
    $opts{format} =~ s/\\([nt])/"qq|\\$1|"/gee;

    # add a trailing slash if they forgot
    $opts{cache_dir} =~ s|([^/])$|$1/|;

    # unless it's defined and readable (or the option wasn't used), bail
    unless ( (defined $opts{cookie_path} && -r $opts{cookie_path}) || !defined($opts{cookie_path}) ) {
        say STDERR "Cookie path $opts{cookie_path} isn't readable, bailing";
        exit 1;
    }

    return (\%opts, $urls);
}
sub get_xml_feed {
    my ($url, $rss_cache, $cookie_path) = @_;

    my $ua = LWP::UserAgent->new(
        agent   => 'rss2text (https://github.com/Stantheman/rss2text)',
        timeout => 10,
    );
    if ($cookie_path) {
        $ua->cookie_jar(
            HTTP::Cookies::Netscape->new(
                file => $cookie_path
            )
        );
    }

    # add caching headers if they exist
    if (length($rss_cache->{etag})) {
        $ua->default_header('If-None-Match' => '"' . $rss_cache->{etag} . '"');
    }
    if (length($rss_cache->{last_modified})) {
        $ua->default_header('If-Modified-Since' => '"' . $rss_cache->{last_modified} . '"');
    }

    my $rss_feed = $ua->get($url);

    # nothing to do if it hasn't been modified
    return if ($rss_feed->code() == 304);
    if ($rss_feed->is_error()) {
        say STDERR "$url returned " . $rss_feed->code() . ". Bailing";
        return;
    }

    my $feed;
    try {
        $feed = XML::FeedPP->new($rss_feed->decoded_content);
    } catch {
        say STDERR "$url is not in valid RSS format: ", $_;
        $feed = undef;
    };

    return {
        feed          => $feed,
        etag          => $rss_feed->header('etag') || '',
        last_modified => $rss_feed->header('last-modified') || '',
    };
}
sub parse_token {
    my ($item, $token) = @_;

    # XML::FeedPP will work harder for us if we use their convenience functions instead of get
    if ($item->can($token)) {
        return $item->$token() // "TAG \"$token\" UNDEFINED";
    }

    # otherwise, try to get
    my $token_val = $item->get($token);
    unless ($token_val) {
        return "TAG \"$token\" UNDEFINED";
    }
    return $token_val;
}
### Cache class ###
package rss2text::cache;
use DateTime::Format::W3CDTF;
use Digest::MD5 'md5_hex';
use Try::Tiny;
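# Each cache file (one per URL, named after the md5 of the current uid plus
# the URL) holds three lines: the W3CDTF date of the newest entry seen on the
# last run, the ETag header, and the Last-Modified header from that fetch.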
sub new {
    my ($class, $url, $cache_on, $cache_dir) = @_;

    my $self->{url} = $url;
    $self->{_cache_on} = $cache_on;
    $self->{_cache_dir} = $cache_dir;
    $self->{_cache_filename} = $self->{_cache_dir} . md5_hex($< . $self->{url});
    $self->{w3c} = DateTime::Format::W3CDTF->new;
    $self->{etag} = '';
    $self->{last_modified} = '';
    $self->{last_pulled_dt} = DateTime->from_epoch(epoch => 0);

    return bless $self, $class;
}
sub get_cache_filename {
    my $self = shift;
    return $self->{_cache_filename};
}
sub get_cached_rss {
    my $self = shift;
    return unless $self->{_cache_on};

    mkdir $self->{_cache_dir}, 0755 unless (-e $self->{_cache_dir});
    die "Unable to make $self->{_cache_dir}: $!" unless (-e $self->{_cache_dir});

    unless (-e $self->{_cache_filename}) {
        say STDERR "Cache file for this feed ($self->{url}) doesn't exist.";
        say STDERR "Creating a new cache file for $self->{url} and fetching from the beginning";
        open my $fh, '>', $self->{_cache_filename} or die "Can't create new cache file for this RSS feed: $!";
        return;
    }

    open my $fh, '<', $self->{_cache_filename} or die "Can't read the cached information for this RSS feed: $!";
    my $last_pulled_dt = <$fh>;
    unless ($last_pulled_dt) {
        say STDERR "Cache file for this feed ($self->{url}) is empty, starting from 0";
        return;
    }
    chomp($last_pulled_dt);

    # parse_datetime might die
    try {
        $last_pulled_dt = $self->{w3c}->parse_datetime($last_pulled_dt);
    } catch {
        say STDERR "Cached pull date for $self->{url} isn't valid: $last_pulled_dt. Using 0";
        $last_pulled_dt = $self->{last_pulled_dt};
    };
    $self->{last_pulled_dt} = $last_pulled_dt;

    chomp($self->{etag} = <$fh>);
    chomp($self->{last_modified} = <$fh>);

    return $self->{last_pulled_dt};
}
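# Returns true when the cached timestamp is the same age as or newer than the
# supplied date, i.e. the corresponding entry was already seen on a previous run.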
sub is_cached_newer {
    my ($self, $compare_dt) = @_;
    return unless $self->{_cache_on};

    state $already_told_you = 0;
    unless ($compare_dt) {
        say STDERR "Can't find a date to compare to for $self->{url}, cache is useless this run" unless ($already_told_you);
        $already_told_you = 1;
        return;
    }

    my $parsed_compare_dt;
    try {
        $parsed_compare_dt = $self->{w3c}->parse_datetime($compare_dt);
    } catch {
        unless ($already_told_you) {
            say STDERR "$compare_dt isn't in W3CDTF format, can't compare to cache for $self->{url}. Assuming it's new";
            say STDERR "Run rss2text without caching if this always happens with this URL ($self->{url})";
        }
        $already_told_you = 1;
    };
    return unless $parsed_compare_dt;

    return (DateTime->compare($self->{last_pulled_dt}, $parsed_compare_dt) >= 0);
}
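# Rewrites the cache file when the newest entry is more recent than the cached
# timestamp, or when the server's Last-Modified header has changed.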
sub update_rss_cache {
    my ($self, $feed) = @_;
    return unless $self->{_cache_on};

    my $item = $feed->{feed}->get_item(0) or do {
        say STDERR "Can't get the first item from the feed ($self->{url}). Not updating the cache";
        return;
    };

    my $new_dt = $item->pubDate() // $item->get('pubDate') // $feed->{feed}->pubDate();
    unless (defined($new_dt)) {
        say STDERR "Can't get the published date from the first item in the feed ($self->{url}). Not updating the cache";
        return;
    }

    try {
        $new_dt = $self->{w3c}->parse_datetime($new_dt);
    } catch {
        say STDERR "$new_dt isn't in W3CDTF format, not saving to cache for $self->{url}";
        $new_dt = undef;
    };
    return unless $new_dt;

    # if the last_pulled_dt < $new_dt
    if ( (DateTime->compare($self->{last_pulled_dt}, $new_dt) == -1) || ($self->{last_modified} ne $feed->{last_modified}) ) {
        open my $fh, '>', $self->{_cache_filename} or die "Unable to update the cache file: $!";
        print $fh $self->{w3c}->format_datetime($new_dt) . "\n";
        print $fh $feed->{etag} . "\n";
        print $fh $feed->{last_modified} . "\n";
        close $fh;
    }
}
__END__

=head1 rss2text

Takes a feed URL and an optional format string, and prints a line for every new entry.

=head1 USAGE

    ./rss2text.pl URL
    ./rss2text.pl --format "__title__: __link__" URL

=head1 SYNOPSIS

    ./rss2text [options] URL

    Options:
      -f, --format         template string for returning results.
      -[no]c, --[no]cache  enables/disables the cache.
      -i, --input          pass a file of URLs to download, or "-" for STDIN
      --cache_dir          location of the cache directory.
      --cookie_path        path to a cookie file to send with the request
      -d, --debug          print the path to the cache file
      -w, --workers        change the number of workers
=head1 OPTIONS

=over 4

=item B<-f> I<format string>, B<--format>=I<format string>

The format string dictates how rss2text returns your data. It can include any
static text you'd like along with placeholders, which are delimited by double
underscores. A placeholder looks like __name__ and is substituted with the
matching child element of the feed entry.

The format string can take any child elements that belong in an entry. Typical
placeholders include "title", "description", "published", "link", and "author".

Printing the title of every entry is achieved by passing "__title__" as the
format string. If you want to print the title, a colon and a single space, and
then the link, simply pass "__title__: __link__".

You can request anything you'd like if you know that a feed will have the item
you're requesting. If it's not there, you'll get a big pretty placeholder
message in your output:

    TAG "thing" UNDEFINED

The default value is "__link__".
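For instance, assuming a feed at http://example.com/feed.xml whose entries
carry both a title and a pubDate element (the URL and the output line below
are only illustrative):

    ./rss2text.pl -f "__title__ (__pubDate__)" http://example.com/feed.xml
    An example post title (Tue, 01 Jan 2013 00:00:00 +0000)
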
=item B<-[no]c>, B<--[no]cache>

The cache option enables or disables the cache. rss2text caches the date of the
newest entry it saw on the last run, along with any HTTP caching headers from
the response (the ETag and Last-Modified values).

The default is to cache.
=item B<-i> I<filename>, B<--input>=I<filename>

The location of a file that contains a newline-separated list of URLs to pull.
The filename can also be "-", in which case URLs are read from STDIN.

You don't have to pass a URL on the command line if you use this option. If you
do, those URLs will be appended to the list. This behavior is borrowed from
wget.
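A quick sketch, with an illustrative file name and an extra command-line URL:

    # urls.txt holds one feed URL per line
    ./rss2text.pl -i urls.txt
    ./rss2text.pl -i urls.txt http://example.com/extra-feed.xml
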
=item B<--cache_dir>

This option specifies the directory in which to store cached information. It
does nothing if caching is disabled.

The default location for the cache is /tmp/rss2text/.
=item B<--cookie_path>

Specifies the location of a cookie file to be sent along with each request. The
cookies must be saved in Netscape format (more usefully: the format that curl
saves cookies in).

By default, rss2text does not send any cookies with its requests.
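curl's cookie jar (written with its -c option) uses this format, so a session
captured with curl can be reused here; the login URL and file names below are
only illustrative:

    curl -c cookies.txt -d 'user=me&pass=secret' http://example.com/login
    ./rss2text.pl --cookie_path cookies.txt http://example.com/private/feed.xml
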
=item B<-d>, B<--debug>

Debug takes the given URL(s) and prints the location of the on-disk file that
holds the cached information for each one. This option overrides the others.
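For example, with an illustrative URL:

    ./rss2text.pl -d http://example.com/feed.xml
    # prints something like /tmp/rss2text/<md5 of your uid and the URL>
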
=item B<-w>, B<--workers>

Change the number of workers used to grab URLs. Defaults to 10.

=back
=head1 DESCRIPTION

rss2text takes a feed and an optional format string, grabs the feed and loops
over the returned entries, printing what was requested in the format string.
It's like printf for RSS feeds and is particularly useful for one-liners and
other places where you need a textual interface.

rss2text will, by default, try to cache as much information as possible in order to
prevent displaying entries that were seen on a previous run. This makes rss2text
especially useful for cronjobs.
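Since the cache suppresses entries that were already seen, a crontab entry like
the following (schedule and paths are illustrative) appends only links that are
new since the last run:

    */30 * * * * /usr/local/bin/rss2text.pl http://www.schwertly.com/feed/ >> $HOME/new-links.txt
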
=head1 DEPENDENCIES

rss2text is written in perl and uses LWP::UserAgent to grab feeds, XML::FeedPP
to parse feeds, DateTime::Format::W3CDTF to parse dates, and Try::Tiny to make
sure DateTime::Format::W3CDTF doesn't kill the program. It will make use of
HTTP::Cookies::Netscape if you ask it to send a cookie with a request. rss2text
uses POE to help grab feeds more quickly.

Debian has packages available for each:

    apt-get install libwww-perl libxml-feedpp-perl libdatetime-format-w3cdtf-perl libtry-tiny-perl libhttp-cookies-perl libpoe-perl

rss2text requires perl 5.10.0. Older perls can be used, but you'll have to do
the say/print-newline dance yourself.
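If the Debian packages aren't an option, the same modules can be installed from
CPAN instead; a sketch assuming the cpanm client is available:

    cpanm LWP::UserAgent HTTP::Cookies XML::FeedPP DateTime::Format::W3CDTF Try::Tiny POE
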
=head1 EXAMPLES

    # print a list of new links from the feed
    ./rss2text.pl http://www.schwertly.com/feed/

    # print a list of titles from the feed without using the cache
    ./rss2text.pl --nocache -f "__title__" http://www.schwertly.com/feed/

    # print the title, a newline, then tab in, then the link
    ./rss2text.pl -f "__title__\n\t__link__" http://www.schwertly.com/feed/

    # pull updates for all of your blogs with perl in the URL
    grep -i perl urls.txt | ./rss2text.pl -i -
=head1 AUTHOR

Stan Schwertly (http://www.schwertly.com)