Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
SamSaffron committed May 11, 2023
0 parents commit ebf610d
Show file tree
Hide file tree
Showing 9 changed files with 294 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
config.json
tmp/
dump.db
6 changes: 6 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# frozen_string_literal: true

source "https://rubygems.org"

gem "sqlite3"
gem "mini_sql"
15 changes: 15 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
GEM
remote: https://rubygems.org/
specs:
mini_sql (1.4.0)
sqlite3 (1.6.2-x86_64-linux)

PLATFORMS
x86_64-linux

DEPENDENCIES
mini_sql
sqlite3

BUNDLED WITH
2.4.7
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
### Public Data Dump for your forum

This repo attempts to establish a pattern for a public data dump. It includes 2 data explorer queries you can use to export all your public data.

Public data is defined as forum topics and posts that anonymous users can access.

### How to use this?

First you need to define 2 queries using data explorer:

1. Topic query: [here](topic_query.sql)
2. Post query: [here](post_query.sql)

Once defined note the data explorer query ids as specified in the URL

Next, define an API key with rights to run the 2 queries.

### config.json

Create a [config.json](config.json.sample) specifying the domain of your discourse site, api key and data explorer query ids.

### Importing the site into Sqlite

The first phase of the import is importing the site into a sqlite3 db. This intermediary db stores all the content.

Run: `ruby download_topics.rb`

### Importing the Sqlite db into Discourse

1. Start with a blank DB
2. ... (in progress)
7 changes: 7 additions & 0 deletions config.json.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"api_username": "USER",
"api_key": "API_KEY",
"topics_query_id": "QUERY_ID",
"posts_query_id": "QUERY_ID",
"domain": "YOUR_DISCOURSE.com"
}
168 changes: 168 additions & 0 deletions download_topics.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
require "json"
require "net/http"
require "uri"
require "sqlite3"
require "mini_sql"
require "cgi"

# Load API credentials and data-explorer query ids from config.json.
# The script cannot do anything useful without them, so exit with a clear
# message instead of continuing with a nil config (which previously
# crashed with NoMethodError a few lines later; the old message also
# pointed at a nonexistent ".creds" file).
begin
  config = JSON.parse(File.read("config.json"))
rescue StandardError => e
  warn "Please create a config.json file with your API key, username and query ids (see config.json.sample): #{e.message}"
  exit 1
end

# Discourse instance details and data-explorer query ids (see README).
DISCOURSE_DOMAIN = config["domain"]
API_KEY = config["api_key"]
API_USERNAME = config["api_username"]
TOPIC_QUERY_ID = config["topics_query_id"]
POST_QUERY_ID = config["posts_query_id"]

# Intermediary SQLite db that stores all downloaded content.
sqlite_conn = SQLite3::Database.new("dump.db")
conn = MiniSql::Connection.get(sqlite_conn)

# Run a data-explorer query on the remote Discourse instance and return
# the parsed JSON response body.
#
# query_id - data-explorer query id (taken from the query's URL)
# limit    - maximum number of rows the query may return
# min_id   - forwarded to the query as its :min_id param, used for
#            keyset paging by the callers below
def run_report(query_id:, limit:, min_id: 0)
  query_params = CGI.escape(JSON.generate(min_id: min_id.to_s))
  uri =
    URI(
      "https://#{DISCOURSE_DOMAIN}/admin/plugins/explorer/queries/#{query_id}/run?limit=#{limit}&params=#{query_params}"
    )

  request = Net::HTTP::Post.new(uri.request_uri)
  request["Content-Type"] = "application/json"
  request["Api-Key"] = API_KEY
  request["Api-Username"] = API_USERNAME

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  JSON.parse(http.request(request).body)
end

# Create the intermediary tables (idempotent). SQLite is dynamically
# typed, so only the INTEGER PRIMARY KEY columns declare a type.
def create_schema(conn)
  ddl = [
    <<~SQL,
      CREATE TABLE IF NOT EXISTS topics (
        id INTEGER PRIMARY KEY,
        category,
        title,
        created_at,
        user_id,
        tags
      )
    SQL
    <<~SQL,
      CREATE TABLE IF NOT EXISTS users(
        id INTEGER PRIMARY KEY,
        username,
        name
      )
    SQL
    <<~SQL
      CREATE TABLE IF NOT EXISTS posts(
        id INTEGER PRIMARY KEY,
        raw,
        post_number,
        topic_id,
        user_id,
        created_at
      )
    SQL
  ]

  ddl.each { |statement| conn.exec(statement) }
end

# Bulk-insert post rows inside a single transaction.
#
# rows - arrays shaped [id, raw, post_number, topic_id, user_id, created_at]
#
# Returns a hash with :highest_id (largest post id seen, 0 for no rows —
# used by the caller as the next page's min_id) and :posts_loaded.
def load_posts(conn, rows)
  insert_sql = <<~SQL
    INSERT OR IGNORE INTO posts (id, raw, post_number, topic_id, user_id, created_at)
    VALUES (?, ?, ?, ?, ?, ?)
  SQL

  highest_id = 0
  count = 0

  conn.exec "BEGIN TRANSACTION"
  rows.each do |values|
    conn.exec(insert_sql, *values)
    count += 1
    id = values[0]
    highest_id = id if id > highest_id
  end
  conn.exec "COMMIT TRANSACTION"

  { highest_id: highest_id, posts_loaded: count }
end

# Bulk-insert topic rows inside a single transaction.
#
# rows - arrays shaped [id, category, title, created_at, user_id, tags]
#
# Returns a hash with :highest_id (largest topic id seen, 0 for no rows —
# used by the caller as the next page's min_id) and :topics_loaded.
def load_topics(conn, rows)
  insert_sql = <<~SQL
    INSERT OR IGNORE INTO topics (id, category, title, created_at, user_id, tags)
    VALUES (?, ?, ?, ?, ?, ?)
  SQL

  highest_id = 0
  count = 0

  conn.exec "BEGIN TRANSACTION"
  rows.each do |values|
    conn.exec(insert_sql, *values)
    count += 1
    id = values[0]
    highest_id = id if id > highest_id
  end
  conn.exec "COMMIT TRANSACTION"

  { highest_id: highest_id, topics_loaded: count }
end

# Bulk-insert user rows ([id, username, name] arrays) inside a single
# transaction. Returns the number of rows processed.
def load_users(conn, rows)
  count = 0

  conn.exec "BEGIN TRANSACTION"
  rows.each do |values|
    conn.exec(<<~SQL, *values)
      INSERT OR IGNORE INTO users(id, username, name)
      VALUES (?, ?, ?)
    SQL
    count += 1
  end
  conn.exec "COMMIT TRANSACTION"

  count
end

# Extract the user records embedded in a data-explorer response payload
# (under "relations" -> "user") and upsert them via load_users, printing
# how many were processed. No-op when the payload has no user relation.
def load_users_from_json(conn, json)
  users = json.dig("relations", "user")
  return unless users

  rows = users.map { |user| user.values_at("id", "username", "name") }
  puts "Loaded #{load_users(conn, rows)} users"
end

create_schema(conn)

# Page through all public topics by id: each data-explorer call returns
# up to 10k rows with id > min_id; stop once a batch comes back empty.
min_id = 0
loop do
  response_data =
    run_report(query_id: TOPIC_QUERY_ID, min_id: min_id, limit: 10_000)

  load_users_from_json(conn, response_data)

  result = load_topics(conn, response_data["rows"])
  puts "Loaded #{result[:topics_loaded]} topics (highest id is #{result[:highest_id]})"

  break if result[:topics_loaded] == 0

  min_id = result[:highest_id]
end

# Same keyset-paging scheme for posts.
min_id = 0
loop do
  response_data =
    run_report(query_id: POST_QUERY_ID, min_id: min_id, limit: 10_000)

  load_users_from_json(conn, response_data)

  result = load_posts(conn, response_data["rows"])
  puts "Loaded #{result[:posts_loaded]} posts (highest id is #{result[:highest_id]})"

  break if result[:posts_loaded] == 0

  min_id = result[:highest_id]
end
28 changes: 28 additions & 0 deletions import_db.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
require "sqlite3"
require "mini_sql"

# Second phase of the import: read the intermediary dump.db produced by
# download_topics.rb and create matching users in a (blank) Discourse DB.
sqlite_conn = SQLite3::Database.new("dump.db")
conn = MiniSql::Connection.get(sqlite_conn)

# NOTE(review): hard-coded local Discourse checkout — consider reading the
# path from an env var so this runs on other machines.
Dir.chdir("/home/sam/Source/discourse")
require "/home/sam/Source/discourse/config/environment"

puts "Importing users..."

created = 0
conn
  .query("SELECT * FROM users")
  .each do |row|
    if !User.exists?(row.id)
      # Placeholder credentials: random password/email since the public
      # dump carries neither. name is set to the username — presumably to
      # avoid importing real names, even though the dump has a name
      # column; TODO confirm that is deliberate.
      User.create(
        id: row.id,
        username: row.username,
        name: row.username,
        password: SecureRandom.hex,
        email: "#{SecureRandom.hex}@email.com"
      )
      # Bug fix: only count users we actually attempted to create. The
      # original incremented for every row (including pre-existing
      # users), inflating the reported count.
      created += 1
      puts "#{created} users created" if created % 500 == 0
    end
    print "."
  end
21 changes: 21 additions & 0 deletions post_query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- [params]
-- int :min_id = 0

-- Data-explorer query: export every publicly visible post, for the
-- public data dump consumed by download_topics.rb.
-- "Public" here means: topic's category is not read-restricted, neither
-- the topic nor the post is deleted, the post is a regular post
-- (post_type = 1) and is not hidden.
-- :min_id enables keyset paging: the caller passes the highest post id
-- it has already seen; rows are ordered by id ascending so the last row
-- of a batch is the next min_id.
SELECT
p.id,
p.raw,
p.post_number,
p.topic_id,
p.user_id,
p.created_at
FROM topics t
JOIN posts p ON p.topic_id = t.id
JOIN categories c ON c.id = t.category_id
WHERE NOT c.read_restricted
AND t.deleted_at IS NULL
AND p.deleted_at IS NULL
AND p.post_type = 1
AND NOT p.hidden
AND p.id > :min_id
ORDER BY p.id ASC

15 changes: 15 additions & 0 deletions topic_query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- [params]
-- int :min_id = 0

-- Data-explorer query: export every publicly visible topic (category not
-- read-restricted, topic not deleted) for the public data dump consumed
-- by download_topics.rb. Columns match the sqlite topics table:
-- id, category (name), title, created_at, user_id, tags.
-- :min_id enables keyset paging (see post_query.sql).
-- NOTE(review): STRING_AGG without ORDER BY gives an unspecified tag
-- order — add "ORDER BY tag.name" inside the aggregate if a stable order
-- matters.
SELECT
t.id,
c.name,
t.title,
t.created_at,
t.user_id,
(SELECT STRING_AGG(tag.name, ', ') FROM topic_tags tt JOIN tags tag ON tag.id = tt.tag_id WHERE tt.topic_id = t.id) AS all_tags
FROM topics t
JOIN categories c ON c.id = t.category_id
WHERE NOT c.read_restricted AND t.deleted_at IS NULL
AND t.id > :min_id
ORDER BY t.id ASC


0 comments on commit ebf610d

Please sign in to comment.