Skip to content

Commit 0250ff8

Browse files
committed
Merge branch 'next'
* next: Factor out the interactions handler in the worker code. Document the new connection configuration option. [TAV-522] Configure number of network error retries. [TAV-522] Handle network level errors (retry). Fix the "fail in post run callback" worker test. Factor out run creation in the worker. Use the latest t2-server client library.
2 parents 633b9e8 + d64eb97 commit 0250ff8

File tree

8 files changed

+195
-78
lines changed

8 files changed

+195
-78
lines changed

README.rdoc

+13-1
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,19 @@ that make use of interactions (so you have users watching the workflows
313313
running) then it should probably be set lower.
314314

315315
There are a number of options for configuring the connection to the Taverna
316-
Server. These are actually provided by the underlying
316+
Server. The first specifies how many times an initial connection to Taverna
317+
Server will be retried if there are any low level network errors.
318+
319+
config.server_connection_error_retries = 5
320+
321+
These can be events such as timeouts, broken connections, refused connections
322+
or anything else out of our control. If such an error is detected Taverna
323+
Player will back off for the amount of time specified by
324+
<tt>config.server_retry_interval</tt> before retrying. If the maximum number of
325+
retries is reached and there is still an error then the run will fail; the
326+
network error message will be recorded as the reason for the run's failure.
327+
328+
The rest of the connection options are actually provided by the underlying
317329
{t2-server client library}[https://rubygems.org/gems/t2-server] and surfaced
318330
here for extra control. They are
319331
{documented in more detail}[http://mygrid.github.io/t2-server-gem/] elsewhere

config/locales/en.yml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ en:
1717
pre-callback: "Running pre-run tasks"
1818
connect: "Connecting to Taverna Server"
1919
full: "Server full - please wait; run will start soon"
20+
network-error: "Network error - please wait; retrying"
2021
initialized: "Initializing new workflow run"
2122
inputs: "Uploading run inputs"
2223
start: "Starting run"

lib/generators/templates/server_initializer.rb

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
config.server_retry_interval = 10
1414

1515
# Taverna Server connection configuration.
16+
#config.server_connection_error_retries = 5
1617
#config.server_connection[:verify_peer] = true
1718
#config.server_connection[:ca_file] = "/etc/certs/my-cert.crt"
1819
#config.server_connection[:ca_path] = "/etc/certs/"

lib/taverna-player.rb

+4
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ def self.port_renderers
121121
mattr_accessor :server_retry_interval
122122
@@server_retry_interval = 10
123123

124+
# Number of times to retry on a low-level connection error.
125+
mattr_accessor :server_connection_error_retries
126+
@@server_connection_error_retries = 5
127+
124128
# Taverna Server connection parameters
125129
mattr_accessor :server_connection
126130
@@server_connection = T2Server::DefaultConnectionParameters.new

lib/taverna_player/worker.rb

+107-75
Original file line numberDiff line numberDiff line change
@@ -50,23 +50,11 @@ def perform
5050

5151
begin
5252
server = T2Server::Server.new(@server, conn_params)
53-
wkf = File.read(@workflow)
53+
workflow = File.read(@workflow)
54+
run = create_run(server, workflow, credentials)
5455

55-
# Try and create the run bearing in mind that the server might be at
56-
# the limit of runs that it can hold at once.
57-
begin
58-
run = server.create_run(wkf, credentials)
59-
rescue T2Server::ServerAtCapacityError
60-
status_message("full")
61-
62-
if cancelled?
63-
cancel
64-
return
65-
end
66-
67-
sleep(TavernaPlayer.server_retry_interval)
68-
retry
69-
end
56+
# If run is nil here then we could have failed or have been cancelled.
57+
return if run.nil?
7058

7159
status_message("initialized")
7260

@@ -121,65 +109,7 @@ def perform
121109
return
122110
end
123111

124-
run.notifications(:requests).each do |note|
125-
if @run.has_parent?
126-
next if note.has_reply? || note.is_notification?
127-
int = Interaction.find_by_run_id_and_serial(@run.parent_id, note.serial)
128-
new_int = Interaction.find_or_initialize_by_run_id_and_serial(@run.id, note.serial)
129-
if new_int.new_record?
130-
note.reply(int.feed_reply, int.data)
131-
new_int.displayed = true
132-
new_int.replied = true
133-
new_int.feed_reply = int.feed_reply
134-
new_int.data = int.data
135-
new_int.save
136-
end
137-
else
138-
int = Interaction.find_or_initialize_by_run_id_and_serial(@run.id, note.serial)
139-
140-
# Need to catch this here in case some other process has replied.
141-
if note.has_reply? && !int.replied?
142-
int.replied = true
143-
int.save
144-
end
145-
146-
unless int.replied?
147-
if int.page.blank?
148-
page = server.read(note.uri, "text/html", credentials)
149-
150-
INTERACTION_REGEX.match(page) do
151-
page_uri = $1
152-
153-
if page_uri.starts_with?(server.uri.to_s)
154-
page = server.read(URI.parse(page_uri), "text/html", credentials)
155-
int.page = page.gsub("#{run.interactions_uri.to_s}/pmrpc.js",
156-
"/assets/taverna_player/application.js")
157-
else
158-
int.page_uri = page_uri
159-
end
160-
end
161-
end
162-
163-
if note.is_notification? && !int.new_record?
164-
int.replied = true
165-
end
166-
167-
if int.data.blank?
168-
int.data = note.input_data.force_encoding("UTF-8")
169-
end
170-
171-
if !int.feed_reply.blank? && !int.data.blank?
172-
note.reply(int.feed_reply, int.data)
173-
174-
int.replied = true
175-
end
176-
177-
int.save
178-
end
179-
180-
waiting = true unless int.replied?
181-
end
182-
end
112+
waiting = interactions(run, credentials)
183113

184114
status_message(waiting ? "interact" : "running")
185115
end
@@ -220,6 +150,108 @@ def server_credentials
220150
T2Server::HttpBasic.new(user, pass)
221151
end
222152

153+
# Try and create the run bearing in mind that the server might be at
154+
# the limit of runs that it can hold at once.
155+
def create_run(server, workflow, credentials)
156+
retries ||= TavernaPlayer.server_connection_error_retries
157+
server.create_run(workflow, credentials)
158+
rescue T2Server::ServerAtCapacityError
159+
status_message("full")
160+
161+
if cancelled?
162+
cancel
163+
return
164+
end
165+
166+
sleep(TavernaPlayer.server_retry_interval)
167+
retry
168+
rescue T2Server::ConnectionError => ce
169+
status_message("network-error")
170+
171+
if cancelled?
172+
cancel
173+
return
174+
end
175+
176+
sleep(TavernaPlayer.server_retry_interval)
177+
unless retries.zero?
178+
retries -= 1
179+
retry
180+
end
181+
182+
# If we're out of retries, fail the run.
183+
failed(ce)
184+
end
185+
186+
def interactions(run, credentials)
187+
wait = false
188+
189+
run.notifications(:requests).each do |note|
190+
if @run.has_parent?
191+
next if note.has_reply? || note.is_notification?
192+
193+
int = Interaction.find_by_run_id_and_serial(@run.parent_id, note.serial)
194+
new_int = Interaction.find_or_initialize_by_run_id_and_serial(@run.id, note.serial)
195+
196+
if new_int.new_record?
197+
note.reply(int.feed_reply, int.data)
198+
new_int.displayed = true
199+
new_int.replied = true
200+
new_int.feed_reply = int.feed_reply
201+
new_int.data = int.data
202+
new_int.save
203+
end
204+
else
205+
int = Interaction.find_or_initialize_by_run_id_and_serial(@run.id, note.serial)
206+
207+
# Need to catch this here in case some other process has replied.
208+
if note.has_reply? && !int.replied?
209+
int.replied = true
210+
int.save
211+
end
212+
213+
unless int.replied?
214+
if int.page.blank?
215+
server = run.server
216+
page = server.read(note.uri, "text/html", credentials)
217+
218+
INTERACTION_REGEX.match(page) do
219+
page_uri = $1
220+
221+
if page_uri.starts_with?(server.uri.to_s)
222+
page = server.read(URI.parse(page_uri), "text/html", credentials)
223+
int.page = page.gsub("#{run.interactions_uri.to_s}/pmrpc.js",
224+
"/assets/taverna_player/application.js")
225+
else
226+
int.page_uri = page_uri
227+
end
228+
end
229+
end
230+
231+
# If this is a pure notification that we've already seen then
232+
# set it as replied as we don't want it blocking a proper
233+
# interaction.
234+
int.replied = true if note.is_notification? && !int.new_record?
235+
236+
if int.data.blank?
237+
int.data = note.input_data.force_encoding("UTF-8")
238+
end
239+
240+
if !int.feed_reply.blank? && !int.data.blank?
241+
note.reply(int.feed_reply, int.data)
242+
int.replied = true
243+
end
244+
245+
int.save
246+
end
247+
248+
wait = true unless int.replied?
249+
end
250+
end
251+
252+
wait
253+
end
254+
223255
# Run the specified callback and return false on error so that we know to
224256
# return out of the worker code completely.
225257
def run_callback(cb, message)

taverna_player.gemspec

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Gem::Specification.new do |s|
3737
s.add_dependency "jquery-rails", "~> 3.0"
3838
s.add_dependency "paperclip", "~> 4.1"
3939
s.add_dependency "taverna-t2flow", "~> 0.5.1"
40-
s.add_dependency "t2-server", "~> 1.1"
40+
s.add_dependency "t2-server", "~> 1.2"
4141
s.add_dependency "delayed_job_active_record", "~> 4.0"
4242
s.add_dependency "daemons", "~> 1.1.9"
4343
s.add_dependency "rubyzip", "~> 1.1.4"

test/dummy/config/locales/taverna_player.en.yml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ en:
1717
pre-callback: "Running pre-run tasks"
1818
connect: "Connecting to Taverna Server"
1919
full: "Server full - please wait; run will start soon"
20+
network-error: "Network error - please wait; retrying"
2021
initialized: "Initializing new workflow run"
2122
inputs: "Uploading run inputs"
2223
start: "Starting run"

test/unit/taverna_player/worker_test.rb

+67-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class WorkerTest < ActiveSupport::TestCase
2828
config.server_password = "taverna"
2929
config.server_poll_interval = 0
3030
config.server_retry_interval = 0
31+
config.server_connection_error_retries = 2
3132
config.pre_run_callback = @noop_callback
3233
config.post_run_callback = @noop_callback
3334
config.run_cancelled_callback = @noop_callback
@@ -38,6 +39,7 @@ class WorkerTest < ActiveSupport::TestCase
3839

3940
# Stuff we can't test yet in TavernaPlayer::Worker.
4041
flexmock(TavernaPlayer::Worker).new_instances do |w|
42+
w.should_receive(:interactions).and_return(false)
4143
w.should_receive(:download_outputs).and_return_undefined
4244
w.should_receive(:process_outputs).and_return([])
4345
end
@@ -101,7 +103,6 @@ class WorkerTest < ActiveSupport::TestCase
101103
r.should_receive(:name=).once.and_return(true)
102104
r.should_receive(:start).twice.and_return(false, true)
103105
r.should_receive(:start_time).and_return(Time.now)
104-
r.should_receive(:notifications).and_return([])
105106
r.should_receive(:finish_time).and_return(Time.now)
106107
r.should_receive(:log).once.and_return(0)
107108
r.should_receive(:delete).and_return_undefined
@@ -281,6 +282,25 @@ class WorkerTest < ActiveSupport::TestCase
281282
# Set a failing post_run callback
282283
TavernaPlayer.post_run_callback = Proc.new { raise RuntimeError }
283284

285+
# Stub the creation of a run on a Taverna Server.
286+
flexmock(T2Server::Server).new_instances do |s|
287+
s.should_receive(:initialize_run).once.
288+
and_return(URI.parse("http://localhost/run/01"))
289+
end
290+
291+
# Stub the Taverna Server run calls.
292+
flexmock(T2Server::Run).new_instances do |r|
293+
r.should_receive(:status).times(3).and_return(:initialized, :running, :finished)
294+
r.should_receive(:create_time).and_return(Time.now)
295+
r.should_receive(:add_password_credential).and_return(true)
296+
r.should_receive(:name=).once.and_return(true)
297+
r.should_receive(:start).once.and_return(true)
298+
r.should_receive(:start_time).and_return(Time.now)
299+
r.should_receive(:finish_time).and_return(Time.now)
300+
r.should_receive(:log).once.and_return(0)
301+
r.should_receive(:delete).and_return_undefined
302+
end
303+
284304
assert_equal :pending, @run.state, "Initial run state not ':pending'"
285305

286306
@worker.perform
@@ -302,4 +322,50 @@ class WorkerTest < ActiveSupport::TestCase
302322

303323
assert_equal :timeout, @run.state, "Final run state not ':timeout'"
304324
end
325+
326+
test "network error with recovery" do
327+
# Stub the creation of a run on a Taverna Server with a network error
328+
# first.
329+
flexmock(T2Server::Server).new_instances do |s|
330+
s.should_receive(:initialize_run).twice.
331+
and_raise(T2Server::ConnectionError, Timeout::Error.new).
332+
and_return(URI.parse("http://localhost/run/01"))
333+
end
334+
335+
# Stub the Taverna Server run calls.
336+
flexmock(T2Server::Run).new_instances do |r|
337+
r.should_receive(:status).times(3).and_return(:initialized, :running, :finished)
338+
r.should_receive(:create_time).and_return(Time.now)
339+
r.should_receive(:add_password_credential).and_return(true)
340+
r.should_receive(:name=).once.and_return(true)
341+
r.should_receive(:start).once.and_return(true)
342+
r.should_receive(:start_time).and_return(Time.now)
343+
r.should_receive(:finish_time).and_return(Time.now)
344+
r.should_receive(:log).once.and_return(0)
345+
r.should_receive(:delete).and_return_undefined
346+
end
347+
348+
assert_equal :pending, @run.state, "Initial run state not ':pending'"
349+
350+
@worker.perform
351+
352+
assert_equal :finished, @run.state, "Final run state not ':finished'"
353+
end
354+
355+
test "network error with no recovery" do
356+
# Stub the creation of a run on a Taverna Server with a network error
357+
# first.
358+
flexmock(T2Server::Server).new_instances do |s|
359+
# Connection retries are set to 2 so this should be called 3 times.
360+
s.should_receive(:initialize_run).times(3).
361+
and_raise(T2Server::ConnectionError, Timeout::Error.new)
362+
end
363+
364+
assert_equal :pending, @run.state, "Initial run state not ':pending'"
365+
366+
@worker.perform
367+
368+
assert_equal :failed, @run.state, "Final run state not ':finished'"
369+
end
370+
305371
end

0 commit comments

Comments
 (0)