From f84433e9891d45b466b74024d2b45821978852d7 Mon Sep 17 00:00:00 2001 From: Mischa Holz Date: Sat, 8 Jul 2023 21:57:19 +0200 Subject: [PATCH 1/2] Bump nokogiri version to make dev environment work --- rb/twitter-text.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rb/twitter-text.gemspec b/rb/twitter-text.gemspec index e145b6362..905a7d978 100644 --- a/rb/twitter-text.gemspec +++ b/rb/twitter-text.gemspec @@ -19,7 +19,7 @@ Gem::Specification.new do |s| s.add_development_dependency "test-unit" s.add_development_dependency "multi_json", "~> 1.3" - s.add_development_dependency "nokogiri", "~> 1.10.9" + s.add_development_dependency "nokogiri", "~> 1.15.3" s.add_development_dependency "rake" s.add_development_dependency "rdoc" s.add_development_dependency "rspec", "~> 3.0" From 72e069d85a1adeeba76da65074900759728999af Mon Sep 17 00:00:00 2001 From: Mischa Holz Date: Sat, 8 Jul 2023 22:24:18 +0200 Subject: [PATCH 2/2] Replace idn-ruby with SimpleIDN and add more domain validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit idn-ruby uses libidn2, which, unfortunately, does not recognize domain names that include emojis such as 🌈🌈🌈.st, even though they are valid and work in pretty much any modern browser. Additionally, the JS implementation of twitter-text already does recognize links such as https://🌈🌈🌈.st as valid link entities. Replacing idn-ruby with another ruby gem that implements the punycode conversion and adding some new validation to the is_valid_domain function allows for https://🌈🌈🌈.st to be returned as a valid link entity, without accepting any other invalid links that are tested in the conformity test suite. 
--- rb/lib/twitter-text/extractor.rb | 11 +++++++++-- rb/spec/test_urls.rb | 4 +++- rb/twitter-text.gemspec | 3 +-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/rb/lib/twitter-text/extractor.rb b/rb/lib/twitter-text/extractor.rb index a2fd7db7b..25734fe58 100644 --- a/rb/lib/twitter-text/extractor.rb +++ b/rb/lib/twitter-text/extractor.rb @@ -3,7 +3,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # encoding: utf-8 -require 'idn' +require 'simpleidn' class String # Helper function to count the character length by first converting to an @@ -57,6 +57,8 @@ module Extractor extend self # Maximum URL length as defined by Twitter's backend. MAX_URL_LENGTH = 4096 + MAX_DOMAIN_LABEL_LENGTH = 63 + # The maximum t.co path length that the Twitter backend supports. MAX_TCO_SLUG_LENGTH = 40 @@ -373,7 +375,12 @@ def is_valid_domain(url_length, domain, protocol) begin raise ArgumentError.new("invalid empty domain") unless domain original_domain_length = domain.length - encoded_domain = IDN::Idna.toASCII(domain) + encoded_domain = SimpleIDN.to_ascii(domain) + # If the domain starts with xn-- but is not only ASCII characters, it's invalid. + return false if domain.start_with?("xn--") && !domain.ascii_only? + labels = encoded_domain.split('.') + # If any label of the domain is longer than 63 characters, it's invalid. 
+ return false if labels.any?{|label| label.length > MAX_DOMAIN_LABEL_LENGTH} updated_domain_length = encoded_domain.length url_length += (updated_domain_length - original_domain_length) if (updated_domain_length > original_domain_length) url_length += URL_PROTOCOL_LENGTH unless protocol diff --git a/rb/spec/test_urls.rb b/rb/spec/test_urls.rb index dc2b5c7ff..942194f3a 100644 --- a/rb/spec/test_urls.rb +++ b/rb/spec/test_urls.rb @@ -41,7 +41,9 @@ module TestUrls "http://foobar.δΈ­ε›½", "http://foobar.ΩΎΨ§Ϊ©Ψ³ΨͺΨ§Ω†", "https://www.youtube.com/playlist?list=PL0ZPu8XSRTB7wZzn0mLHMvyzVFeRxbWn-", - "http://ああ.com" + "http://ああ.com", + "twitter.θ”ι€š", + "https://🌈🌈🌈.st" ] unless defined?(TestUrls::VALID) INVALID = [ diff --git a/rb/twitter-text.gemspec b/rb/twitter-text.gemspec index 905a7d978..2eef1410c 100644 --- a/rb/twitter-text.gemspec +++ b/rb/twitter-text.gemspec @@ -25,8 +25,7 @@ Gem::Specification.new do |s| s.add_development_dependency "rspec", "~> 3.0" s.add_development_dependency "simplecov" s.add_runtime_dependency "unf", "~> 0.1.0" - # Use of idn-ruby requires libidn to be installed separately - s.add_runtime_dependency "idn-ruby" + s.add_runtime_dependency "simpleidn" s.files = `git ls-files`.split("\n") + ['lib/assets/tld_lib.yml'] + Dir['config/*'] s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")