first commit: basic wiki conversion

trentm · Feb 6, 2011 · f51c8b2 · f51c8b2
commit f51c8b2
Show file tree

Hide file tree

Showing 2 changed files with 149 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,6 @@
+A small project with some helper scripts for moving a project of yours on
+[Google Code project hosting](http://code.google.com/hosting/) to
+[GitHub](https://github.com/).
+
+More details to come.
+
diff --git a/wikiconvert.py b/wikiconvert.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+"""
+Usage:
+    python googlecode2github/wikiconvert.py PROJID SRCDIR DSTDIR
+
+where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
+"SRCDIR" is a Google Code project wiki Subversion working copy dir and
+"DSTDIR" is the git clone dir of the git project's wiki.
+"""
+
+__version__ = "1.0.0"
+
+import re
+import sys
+from os.path import *
+from glob import glob
+from pprint import pprint
+import codecs
+from hashlib import md5
+
+
+def log(s):
+    sys.stderr.write(s+"\n")
+
+def convert_dir(proj_id, src_dir, dst_dir):
+    if isfile(src_dir):
+        convert_file(proj_id, src_dir, dst_dir)
+    else:
+        for f in glob(join(src_dir, "*.wiki")):
+            convert_file(proj_id, f, dst_dir)
+
+def convert_file(proj_id, src_path, dst_dir):
+    src = codecs.open(src_path, 'r', 'utf-8').read()
+    meta_lines = []
+    body_lines = []
+    lines = src.splitlines(False)
+    for i, line in enumerate(lines):
+        if line.startswith("#"):
+            meta_lines.append(line)
+        else:
+            assert not line.strip(), "line isn't empty: %r" % line
+            body_lines = lines[i+1:]
+            break
+    meta = {}
+    for line in meta_lines:
+        k,v = line[1:].split(None, 1)
+        meta[k] = v
+    text = '\n'.join(body_lines)
+    s_from_hash = {}
+
+    # Pull out pre-blocks.
+    def sub_pre_block(match):
+        pre = match.group(1)
+        hash = md5(pre).hexdigest()
+        s_from_hash[hash] = _indent(pre)
+        return hash
+    text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
+
+    # Headings.
+    text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
+    text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
+    text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)
+
+    # Tables
+    def sub_table(m):
+        rows = []
+        for line in m.group(0).splitlines(False):
+            if not line.strip():
+                continue
+            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
+        lines = ['<table>']
+        for row in rows:
+            lines.append('  <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
+        lines.append('</table>')
+        return '\n\n' + '\n'.join(lines)
+    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)
+
+    # Lists (don't handle nested lists).
+    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'- \1', text)
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
+
+    # wiki links.
+    def sub_wikilink(m):
+        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
+        if m.group(2):
+            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
+            pass
+        else:
+            s = "[[%s]]" % gh_page_name
+        hash = md5(s).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)
+
+    # Links
+    def sub_link(m):
+        s = "[%s](%s)" % (m.group(2), m.group(1))
+        hash = md5(s).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
+
+    # Italics, bold.
+    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
+    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
+    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
+
+    # Auto-linking "issue \d+"
+    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
+        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
+
+    # Restore hashed-out blocks.
+    for hash, s in s_from_hash.items():
+        text = text.replace(hash, s)
+
+    # Add summary.
+    if "summary" in meta:
+        text = ("# %s\n\n" % meta["summary"]) + text
+
+    base = splitext(basename(src_path))[0]
+    gh_page_name = _gh_page_name_from_gc_page_name(base)
+    dst_path = join(dst_dir, gh_page_name+".md")
+    if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
+        codecs.open(dst_path, 'w', 'utf-8').write(text)
+        log("wrote '%s'" % dst_path)
+
+
+#---- internal support stuff
+
+def _indent(text):
+    return '    ' + '\n    '.join(text.splitlines(False))
+
+def _gh_page_name_from_gc_page_name(gc):
+    """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
+    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    return gh
+
+
+#---- mainline
+
+if __name__ == '__main__':
+    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])