Skip to content

Commit

Permalink
Replace hpricot with nokogiri in wordpressdotcom
Browse files — browse the repository at this point in the history
  • Loading branch information
parkr committed Jan 6, 2025
1 parent a8bdf68 commit bebb840
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 32 deletions.
1 change: 0 additions & 1 deletion jekyll-import.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ Gem::Specification.new do |s|

# importer dependencies:
# s.add_development_dependency("behance", "~> 0.3") # uses outdated dependencies
s.add_development_dependency("hpricot", "~> 0.8")
s.add_development_dependency("htmlentities", "~> 4.3")
s.add_development_dependency("mysql2", "~> 0.3")
s.add_development_dependency("open_uri_redirections", "~> 0.2")
Expand Down
59 changes: 28 additions & 31 deletions lib/jekyll-import/importers/wordpressdotcom.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def self.require_deps
rubygems
fileutils
safe_yaml
hpricot
nokogiri
time
open-uri
open_uri_redirections
Expand All @@ -22,8 +22,8 @@ def self.specify_options(c)
end

# Will modify post DOM tree
def self.download_images(title, post_hpricot, assets_folder)
images = (post_hpricot / "img")
def self.download_images(title, post_doc, assets_folder)
images = post_doc.css("img")
return if images.empty?

Jekyll.logger.info "Downloading images for ", title
Expand Down Expand Up @@ -58,11 +58,11 @@ def initialize(node)
end

def text_for(path)
@node.at(path).inner_text
@node.at(path).text
end

def title
@title ||= text_for(:title).strip
@title ||= text_for("title").strip
end

def permalink_title
Expand All @@ -76,11 +76,9 @@ def permalink_title
end

def permalink
# Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
# but sometimes it works right! I think it's the xml declaration
@permalink ||= begin
uri = text_for("link")
uri = @node.at("link").following[0] if uri.empty?
uri = @node.at("link").next_sibling.text if uri.empty?
URI(uri.to_s).path
end
end
Expand Down Expand Up @@ -127,12 +125,8 @@ def published?

def excerpt
@excerpt ||= begin
text = Hpricot(text_for("excerpt:encoded")).inner_text
if text.empty?
nil
else
text
end
text = Nokogiri::HTML(text_for("excerpt:encoded")).text
text.empty? ? nil : text
end
end
end
Expand All @@ -144,33 +138,36 @@ def self.process(options)
FileUtils.mkdir_p(assets_folder)

import_count = Hash.new(0)
doc = Hpricot::XML(File.read(source))
doc = Nokogiri::XML(File.read(source))
# Fetch authors data from header
authors = Hash[
(doc / :channel / "wp:author").map do |author|
[author.at("wp:author_login").inner_text.strip, {
"login" => author.at("wp:author_login").inner_text.strip,
"email" => author.at("wp:author_email").inner_text,
"display_name" => author.at("wp:author_display_name").inner_text,
"first_name" => author.at("wp:author_first_name").inner_text,
"last_name" => author.at("wp:author_last_name").inner_text,
},]
doc.css("channel > wp|author").map do |author|
[
author.at("wp|author_login").text.strip,
{
"login" => author.at("wp|author_login").text.strip,
"email" => author.at("wp|author_email").text,
"display_name" => author.at("wp|author_display_name").text,
"first_name" => author.at("wp|author_first_name").text,
"last_name" => author.at("wp|author_last_name").text,
}
]
end
] rescue {}

(doc / :channel / :item).each do |node|
doc.css("channel > item").each do |node|
item = Item.new(node)
categories = node.search('category[@domain="category"]').map(&:inner_text).reject { |c| c == "Uncategorized" }.uniq
tags = node.search('category[@domain="post_tag"]').map(&:inner_text).uniq
categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
tags = node.css('category[domain="post_tag"]').map(&:text).uniq

metas = {}
node.search("wp:postmeta").each do |meta|
key = meta.at("wp:meta_key").inner_text
value = meta.at("wp:meta_value").inner_text
node.css("wp|postmeta").each do |meta|
key = meta.at("wp|meta_key").text
value = meta.at("wp|meta_value").text
metas[key] = value
end

author_login = item.text_for("dc:creator").strip
author_login = item.text_for("dc|creator").strip

header = {
"layout" => item.post_type,
Expand All @@ -189,7 +186,7 @@ def self.process(options)
}

begin
content = Hpricot(item.text_for("content:encoded"))
content = Nokogiri::HTML(item.text_for("content:encoded"))
header["excerpt"] = item.excerpt if item.excerpt

if fetch
Expand Down

0 comments on commit bebb840

Please sign in to comment.