From fb9a792cec8ab95fbb547d066f06fedf36edf23a Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 14:56:08 -0700 Subject: [PATCH 1/6] Improve bot detection regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit improves the bot detection regex by:   - adding a number of missing strings, such as alexa, facebookexternalhit, feedburner, nagios, postrank, pingdom, slurp, and yahoo!   - removing redundant strings such as googlebot and robot (redundant because of the inclusion of a general "bot" string)   - consolidating similar strings ("crawler" and "crawling" became "crawl(er|ing)")   - alphabetizing the list of bots --- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index b745bb3..eb34bb4 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'bot|googlebot|crawler|spider|robot|crawling' + :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|nagios|postrank|pingdom|slurp|spider|yahoo!' } def parse(user_agent) From a507269da5f8a9043933946487223b1faebacca3 Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 16:09:43 -0700 Subject: [PATCH 2/6] Add "google web preview" to bot detection regex To generate previews on the fly, Google uses the user-agent "Google Web Preview" (the fully-qualified user-agent you see in your server logs may change from time to time) to render images on demand. This commit adds "google web preview" to the bot detection regex. 
--- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index eb34bb4..a87b743 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|nagios|postrank|pingdom|slurp|spider|yahoo!' + :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!' } def parse(user_agent) From 1f426c5c8b8a4f7661084c1044bbdb5c1f35163e Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 20:21:18 -0700 Subject: [PATCH 3/6] Add yandex to bot-detecting regex This commit adds "yandex" to the list of bots, to detect the popular Russian search engine. --- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index a87b743..9ece03a 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!' 
+ :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!|yandex' } def parse(user_agent) From 76dc372cc00ff76939413efec5fa9b14a31c9ded Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 14:56:08 -0700 Subject: [PATCH 4/6] Improve bot detection regex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit improves the bot detection regex by: - adding a number of missing strings, such as alexa, facebookexternalhit, feedburner, nagios, postrank, pingdom, slurp, and yahoo! - removing redundant strings such as googlebot and robot (redundant because of the inclusion of a general "bot" string)   - consolidating similar strings ("crawler" and "crawling" became "crawl(er|ing)") - alphabetizing the list of bots --- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index b745bb3..eb34bb4 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'bot|googlebot|crawler|spider|robot|crawling' + :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|nagios|postrank|pingdom|slurp|spider|yahoo!' } def parse(user_agent) From c65c2c2bd6ce160c544d7af330ff375c590492ad Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 16:09:43 -0700 Subject: [PATCH 5/6] Add "google web preview" to bot detection regex To generate previews on the fly, Google uses the user-agent "Google Web Preview" (the fully-qualified user-agent you see in your server logs may change from time to time) to render images on demand. This commit adds "google web preview" to the bot detection regex. 
--- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index eb34bb4..a87b743 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|nagios|postrank|pingdom|slurp|spider|yahoo!' + :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!' } def parse(user_agent) From 34318e89ed219452ad43d42b455747df33a4364f Mon Sep 17 00:00:00 2001 From: Joe Lencioni Date: Sun, 5 Aug 2012 20:21:18 -0700 Subject: [PATCH 6/6] Add yandex to bot-detecting regex This commit adds "yandex" to the list of bots, to detect the popular Russian search engine. --- lib/agent_orange/device.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/agent_orange/device.rb b/lib/agent_orange/device.rb index a87b743..9ece03a 100644 --- a/lib/agent_orange/device.rb +++ b/lib/agent_orange/device.rb @@ -14,7 +14,7 @@ class Device < Base DEVICES = { :computer => 'windows|macintosh|x11|linux', :mobile => 'ipod|ipad|iphone|palm|android|opera mini|hiptop|windows ce|smartphone|mobile|treo|psp', - :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!' + :bot => 'alexa|bot|crawl(er|ing)|facebookexternalhit|feedburner|google web preview|nagios|postrank|pingdom|slurp|spider|yahoo!|yandex' } def parse(user_agent)