From 6b45c6d9c4fa5833caba1574fdca52c574fca378 Mon Sep 17 00:00:00 2001 From: Zhongpeng Lin Date: Mon, 10 Jun 2013 22:30:06 -0700 Subject: [PATCH 1/2] Add Content extension --- pycvsanaly2/DBContentHandler.py | 2 +- pycvsanaly2/Database.py | 4 +- pycvsanaly2/GitParser.py | 2 +- pycvsanaly2/extensions/Content.py | 444 ++++++++++++++++++++++++++++++ 4 files changed, 448 insertions(+), 4 deletions(-) create mode 100644 pycvsanaly2/extensions/Content.py diff --git a/pycvsanaly2/DBContentHandler.py b/pycvsanaly2/DBContentHandler.py index 9a31a3e..c928731 100644 --- a/pycvsanaly2/DBContentHandler.py +++ b/pycvsanaly2/DBContentHandler.py @@ -609,7 +609,7 @@ def commit(self, commit): if action.type == 'A': # A file has been added file_id = self.__action_add(path, prefix, log) - elif action.type == 'M': + elif action.type.find('M') >= 0: # A file has been modified file_id = self.__get_file_for_path(path, log.id)[0] elif action.type == 'D': diff --git a/pycvsanaly2/Database.py b/pycvsanaly2/Database.py index 46b1903..a138419 100644 --- a/pycvsanaly2/Database.py +++ b/pycvsanaly2/Database.py @@ -407,7 +407,7 @@ def create_tables(self, cursor): ")") cursor.execute("CREATE TABLE actions (" + "id integer primary key," + - "type varchar(1)," + + "type varchar(2)," + "file_id integer," + "commit_id integer," + "branch_id integer" + @@ -556,7 +556,7 @@ def create_tables(self, cursor): " CHARACTER SET=utf8") cursor.execute("CREATE TABLE actions (" + "id INT," + - "type varchar(1)," + + "type varchar(2)," + "file_id integer," + "commit_id integer," + "branch_id integer," + diff --git a/pycvsanaly2/GitParser.py b/pycvsanaly2/GitParser.py index cc235e1..d282527 100644 --- a/pycvsanaly2/GitParser.py +++ b/pycvsanaly2/GitParser.py @@ -64,7 +64,7 @@ def set_tail(self, tail): patterns['committer'] = re.compile("^Commit:[ \t]+(.*)[ \t]+<(.*)>$") patterns['date'] = re.compile( "^CommitDate: (.* [0-9]+ [0-9]+:[0-9]+:[0-9]+ [0-9][0-9][0-9][0-9]) ([+-][0-9][0-9][0-9][0-9])$") - patterns['file'] = re.compile("^([MAD])[ \t]+(.*)$") + patterns['file'] = re.compile("^([MAD]+)[ \t]+(.*)$") patterns['file-moved'] = re.compile("^([RC])[0-9]+[ \t]+(.*)[ \t]+(.*)$") patterns['branch'] = re.compile("refs/remotes/origin/([^,]*)") patterns['local-branch'] = re.compile("refs/heads/([^,]*)") diff --git a/pycvsanaly2/extensions/Content.py b/pycvsanaly2/extensions/Content.py new file mode 100644 index 0000000..70d169f --- /dev/null +++ b/pycvsanaly2/extensions/Content.py @@ -0,0 +1,444 @@ +# Copyright(C) 2010 University of California, Santa Cruz +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +#(at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Authors : +# Chris Lewis +# Zhongpeng Lin + +from pycvsanaly2.extensions import Extension, register_extension, \ + ExtensionRunError +from pycvsanaly2.Database import SqliteDatabase, MysqlDatabase, statement +from pycvsanaly2.utils import printdbg, printerr, uri_to_filename, to_utf8 +from pycvsanaly2.profile import profiler_start, profiler_stop +from FileRevs import FileRevs +from repositoryhandler.backends import RepositoryCommandError +from repositoryhandler.backends.watchers import CAT, SIZE +from Jobs import JobPool, Job +from io import BytesIO +import os +import traceback + + +# This class holds a single repository retrieve task, +# and keeps the source code until the object is garbage-collected +class ContentJob(Job): + def __init__(self, commit_id, file_id, rev, path): + self.commit_id = commit_id + self.file_id = file_id + self.rev = rev + self.path = path + self._file_contents = "" + self.file_size = None + + def run(self, repo, repo_uri): + self.repo = repo + self.repo_uri = repo_uri + self.repo_type = self.repo.get_type() + + if self.repo_type == 'cvs': + # CVS self.paths contain the module stuff + uri = self.repo.get_uri_for_self.path(self.repo_uri) + module = uri[len(self.repo.get_uri()):].strip('/') + + if module != '.': + self.path = self.path[len(module):].strip('/') + else: + self.path = self.path.strip('/') + else: + self.path = self.path.strip('/') + + suffix = '' + filename = os.path.basename(self.path) + ext_ptr = filename.rfind('.') + if ext_ptr != -1: + suffix = filename[ext_ptr:] + + self._file_contents = self.listen_for_data(self.repo.cat, CAT) + + try: + self.file_size = self.listen_for_data(self.repo.size, SIZE) + except NotImplementedError: + self.file_size = None + + if self.file_size: + self.file_size = int(self.file_size) + + def listen_for_data(self, repo_func, watcher): + def write_line(data, io): + io.write(data) + + io = BytesIO() + + wid = self.repo.add_watch(watcher, write_line, io) + + # Git doesn't need retries because all of the revisions + # are already on disk + if self.repo_type == 'git': + retries = 0 + else: + retries = 3 + + done = False + failed = False + # Try downloading the file revision + while not done and not failed: + try: + repo_func(os.path.join(self.repo_uri, self.path), self.rev) + done = True + except RepositoryCommandError, e: + if retries > 0: + printerr("Command %s returned %d(%s), try again",\ + (e.cmd, e.returncode, e.error)) + retries -= 1 + io.seek(0) + elif retries == 0: + failed = True + printerr("Error obtaining %s@%s. " + + "Command %s returned %d(%s)", \ + (self.path, self.rev, e.cmd, \ + e.returncode, e.error)) + except: + failed = True + printerr("Error obtaining %s@%s.", \ + (self.path, self.rev)) + traceback.print_exc() + + self.repo.remove_watch(watcher, wid) + + results = None + if not failed: + try: + results = io.getvalue() + except Exception, e: + printerr("Error getting contents." + + "Exception: %s", (str(e),)) + finally: + io.close() + return results + + def _get_file_contents(self): + """Returns contents of the file, stripped of whitespace + at either end + """ + # An encode will fail if the source code can't be converted to + # utf-8, ie. it's not already unicode, or latin-1, or something + # obvious. This almost always means that the file isn't source + # code at all. + # TODO: I should really throw a "not source" exception, + # but just doing None is fine for now. + try: + return to_utf8(self._file_contents).encode("utf-8") + except: + return None + + def _set_file_contents(self, contents): + self._file_contents = contents + + def _get_number_of_lines(self): + """Return the number of lines contained within the file, stripped + of whitespace at either end. + + # Note that it looks like doctest doesn't work with properties, + # depending on what your doctest runner is. That's why + # it accesses the setter. There's no need to do this in your code. + >>> cj = ContentJob(None, None, None, None) + >>> cj._set_file_contents("Hello") + >>> cj.file_number_of_lines + 1 + >>> cj._set_file_contents("Hello \\n world") + >>> cj.file_number_of_lines + 2 + >>> cj._set_file_contents("") + >>> cj.file_number_of_lines + 0 + >>> cj._set_file_contents(None) + >>> cj.file_number_of_lines + + >>> cj._set_file_contents("\\n\\n Hello \\n\\n") + >>> cj.file_number_of_lines + 1 + + >>> cj._set_file_contents("a\\nb") + >>> cj.file_number_of_lines + 2 + + >>> cj._set_file_contents("a\\nb\\nc\\nd\\nea\\nb\\nc\\nd\\ne") + >>> cj.file_number_of_lines + 9 + """ + + # Access the internal variable to try and get a count even if + # Unicode conversion fails + + try: + contents = self._file_contents.strip() + except (UnicodeEncodeError, UnicodeDecodeError, AttributeError): + return None + + return len(contents.splitlines()) + + file_number_of_lines = property(_get_number_of_lines) + file_contents = property(_get_file_contents, _set_file_contents) + + +class Content(Extension): + deps = ['FileTypes'] + + MAX_THREADS = 10 + + def __prepare_table(self, connection, drop_table=False): + # Drop the table's old data + if drop_table: + cursor = connection.cursor() + + try: + cursor.execute("DROP TABLE content") + except Exception, e: + printerr("Couldn't drop content table because %s", (e,)) + finally: + cursor.close() + + if isinstance(self.db, SqliteDatabase): + from sqlite3 import OperationalError + cursor = connection.cursor() + + # Note that we can't guarentee sqlite is going + # to provide foreign key support (it was only + # introduced in 3.6.19), so no constraints are set + try: + cursor.execute("""CREATE TABLE content( + id INTEGER PRIMARY KEY, + commit_id INTEGER NOT NULL, + file_id INTEGER NOT NULL, + content CLOB, + loc INTEGER, + size INTEGER, + UNIQUE (commit_id, file_id))""") + cursor.execute("""create index commit_id_index + on content(commit_id)""") + cursor.execute("""create index commit_id_index + on content(file_id)""") + except OperationalError: + # It's OK if the table already exists + pass + except: + raise + finally: + cursor.close() + + elif isinstance(self.db, MysqlDatabase): + from MySQLdb import OperationalError + + cursor = connection.cursor() + + # I removed foreign key constraints because + # cvsanaly uses MyISAM, which doesn't enforce them. + # MySQL was giving errno:150 when trying to create with + # them anyway + try: + cursor.execute("""CREATE TABLE content( + id int(11) NOT NULL auto_increment, + commit_id int(11) NOT NULL, + file_id int(11) NOT NULL, + content mediumtext, + loc int(11), + size int(11), + PRIMARY KEY(id), + UNIQUE (commit_id, file_id), + index(commit_id), + index(file_id) + ) ENGINE=InnoDB CHARACTER SET=utf8""") + + except OperationalError as e: + if e.args[0] == 1050: + # It's OK if the table already exists + pass + else: + raise + except: + raise + finally: + cursor.close() + + connection.commit() + + def __process_finished_jobs(self, job_pool, connection, db): + if isinstance(self.db, SqliteDatabase): + from sqlite3 import IntegrityError + elif isinstance(self.db, MysqlDatabase): + from MySQLdb import IntegrityError + write_cursor = connection.cursor() + finished_job = job_pool.get_next_done(0) + processed_jobs = 0 + # commit_id is the commit ID. For some reason, the + # documentation advocates tablename_id as the reference, + # but in the source, these are referred to as commit IDs. + # Don't ask me why! + while finished_job is not None: + file_contents = str(finished_job.file_contents) + + query = """ + insert into content(commit_id, file_id, content, loc, size) + values(?,?,?,?,?)""" + insert_statement = statement(query, db.place_holder) + parameters = (finished_job.commit_id, + finished_job.file_id, + file_contents, + finished_job.file_number_of_lines, + finished_job.file_size) + try: + write_cursor.execute(insert_statement, parameters) + except IntegrityError as e: + if isinstance(self.db, MysqlDatabase) and e.args[0] == 1062: + # Ignore duplicate entry + pass + else: + printerr('Error while inserting content for file %d @ commit %d' % \ + (finished_job.file_id, finished_job.commit_id)) + raise + + processed_jobs += 1 + finished_job = job_pool.get_next_done(0) + + connection.commit() + write_cursor.close() + + return processed_jobs + + def run(self, repo, uri, db): + # Start the profiler, per every other extension + profiler_start("Running content extension") + + # Open a connection to the database and get cursors + self.db = db + connection = self.db.connect() + read_cursor = connection.cursor() + + # Try to get the repository and get its ID from the database + try: + path = uri_to_filename(uri) + if path is not None: + repo_uri = repo.get_uri_for_path(path) + else: + repo_uri = uri + + read_cursor.execute(statement( \ + "SELECT id from repositories where uri = ?", \ + db.place_holder), (repo_uri,)) + repo_id = read_cursor.fetchone()[0] + except NotImplementedError: + raise ExtensionRunError( \ + "Content extension is not supported for %s repos" % \ + (repo.get_type())) + except Exception, e: + raise ExtensionRunError( \ + "Error creating repository %s. Exception: %s" % \ + (repo.get_uri(), str(e))) + + # Try to create a table for storing the content + # TODO: Removed use case for choosing between all or just the HEAD, + # should ideally put that back again. Just all for now is fine. + try: + self.__prepare_table(connection) + except Exception as e: + raise ExtensionRunError("Couldn't prepare table because " + \ + str(e)) + + queuesize = self.MAX_THREADS + printdbg("Setting queuesize to " + str(queuesize)) + + # This is where the threading stuff comes in, I expect + job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize) + + # This filters files if they're not source files. + # I'm pretty sure "unknown" is returning binary files too, but + # these are implicitly left out when trying to convert to utf-8 + # after download. However, ignore them for now to speed things up + query = "select f.id from file_types ft, files f " + \ + "where f.id = ft.file_id and " + \ + "ft.type in('code') and " + \ + "f.repository_id = ?" + # "ft.type in('code', 'unknown') and " + \ + read_cursor.execute(statement(query, db.place_holder), (repo_id,)) + code_files = [item[0] for item in read_cursor.fetchall()] + query = """select c.file_id, c.commit_id from content c, files f + where c.file_id=f.id and f.repository_id=? + """ + read_cursor.execute(statement(query, db.place_holder), (repo_id,)) + existing_content = [(item[0], item[1]) \ + for item in read_cursor.fetchall()] + + fr = FileRevs(db, connection, read_cursor, repo_id) + + i = 0 + # Loop through each file and its revision + for revision, commit_id, file_id, action_type, composed in fr: + if action_type == 'D': + continue +# loop_start = datetime.now() + if file_id not in code_files: + continue + if (file_id, commit_id) in existing_content: + continue + + try: + relative_path = fr.get_path(repo, path or repo.get_uri()) + except TypeError as e: + printerr("No path found for file %d at commit %d", + (file_id, commit_id)) + continue + if composed: + rev = revision.split("|")[0] + else: + rev = revision + + printdbg("Path for %d at %s -> %s", (file_id, rev, relative_path)) + + # Ignore SVN tags + if repo.get_type() == 'svn' and relative_path == 'tags': + printdbg("Skipping file %s", (relative_path,)) + continue + + job = ContentJob(commit_id, file_id, rev, relative_path) + job_pool.push(job) + i = i + 1 + if i >= queuesize: + printdbg("Content queue is now at %d, flushing to database", + (i,)) + + processed_jobs = self.__process_finished_jobs(job_pool, + connection, db) + i = i - processed_jobs + if processed_jobs < (queuesize / 5): + job_pool.join() + + job_pool.join() + self.__process_finished_jobs(job_pool, connection, db) + + read_cursor.close() + connection.close() + + # This turns off the profiler and deletes it's timings + profiler_stop("Running content extension", delete=True) + + def backout(self, repo, uri, db): + update_statement = """delete from content where + commit_id in (select id from scmlog s + where s.repository_id = ?)""" + + self._do_backout(repo, uri, db, update_statement) + +register_extension("Content", Content) From af146b56a146ce4f44e05058f71e27085e6d6abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhongpeng=20Lin=20=28=E6=9E=97=E4=B8=AD=E9=B9=8F=29?= Date: Wed, 26 Mar 2014 10:08:12 -0700 Subject: [PATCH 2/2] Convert merge actions with 'M' --- pycvsanaly2/DBContentHandler.py | 2 +- pycvsanaly2/Database.py | 4 ++-- pycvsanaly2/GitParser.py | 11 ++++++++++- pycvsanaly2/extensions/Content.py | 9 ++------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pycvsanaly2/DBContentHandler.py b/pycvsanaly2/DBContentHandler.py index c928731..9a31a3e 100644 --- a/pycvsanaly2/DBContentHandler.py +++ b/pycvsanaly2/DBContentHandler.py @@ -609,7 +609,7 @@ def commit(self, commit): if action.type == 'A': # A file has been added file_id = self.__action_add(path, prefix, log) - elif action.type.find('M') >= 0: + elif action.type == 'M': # A file has been modified file_id = self.__get_file_for_path(path, log.id)[0] elif action.type == 'D': diff --git a/pycvsanaly2/Database.py b/pycvsanaly2/Database.py index a138419..46b1903 100644 --- a/pycvsanaly2/Database.py +++ b/pycvsanaly2/Database.py @@ -407,7 +407,7 @@ def create_tables(self, cursor): ")") cursor.execute("CREATE TABLE actions (" + "id integer primary key," + - "type varchar(2)," + + "type varchar(1)," + "file_id integer," + "commit_id integer," + "branch_id integer" + @@ -556,7 +556,7 @@ def create_tables(self, cursor): " CHARACTER SET=utf8") cursor.execute("CREATE TABLE actions (" + "id INT," + - "type varchar(2)," + + "type varchar(1)," + "file_id integer," + "commit_id integer," + "branch_id integer," + diff --git a/pycvsanaly2/GitParser.py b/pycvsanaly2/GitParser.py index d282527..b9b530e 100644 --- a/pycvsanaly2/GitParser.py +++ b/pycvsanaly2/GitParser.py @@ -237,7 +237,16 @@ def _parse_line(self, line): match = self.patterns['file'].match(line) if match: action = Action() - action.type = match.group(1) + type = match.group(1) + if len(type) > 1: + # merge actions + if 'M' in type: + type = 'M' + else: + # ignore merge actions without 'M' + return + + action.type = type action.f1 = match.group(2) self.commit.actions.append(action) diff --git a/pycvsanaly2/extensions/Content.py b/pycvsanaly2/extensions/Content.py index 70d169f..47fc1a7 100644 --- a/pycvsanaly2/extensions/Content.py +++ b/pycvsanaly2/extensions/Content.py @@ -137,13 +137,8 @@ def _get_file_contents(self): # utf-8, ie. it's not already unicode, or latin-1, or something # obvious. This almost always means that the file isn't source # code at all. - # TODO: I should really throw a "not source" exception, - # but just doing None is fine for now. - try: - return to_utf8(self._file_contents).encode("utf-8") - except: - return None - + return to_utf8(self._file_contents) + def _set_file_contents(self, contents): self._file_contents = contents