Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

when converting sha1 hash to filename, use first 2 characters as fold… #87

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 42 additions & 18 deletions git-fat
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ except ImportError:

BLOCK_SIZE = 4096

def hash2fname(hvalue):
    """
    Convert a SHA-1 hex digest to a relative object file name.

    Follows the git convention of using the first 2 characters of the
    digest as a folder name and the remaining characters as the file
    name inside that folder, e.g. ``a0/8929b5...``.

    hvalue -- hex digest string
    Returns the two parts joined with os.path.join.
    """
    return os.path.join(hvalue[:2], hvalue[2:])

def verbose_stderr(*args, **kwargs):
    """Forward all arguments to print(), directing the output to sys.stderr."""
    return print(*args, file=sys.stderr, **kwargs)
def verbose_ignore(*args, **kwargs):
Expand Down Expand Up @@ -132,14 +142,11 @@ class GitFat(object):
sys.exit(1)
self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
if os.environ.get('GIT_FAT_VERSION') == '1':
self.encode = self.encode_v1
else:
self.encode = self.encode_v2
self.encode = self.encode_v2
def magiclen(enc):
return len(enc(hashlib.sha1('dummy').hexdigest(), 5))
self.magiclen = magiclen(self.encode) # Current version
self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions
self.magiclens = [magiclen(enc) for enc in [self.encode_v2]] # All prior versions
def setup(self):
    """Run mkdir_p on self.objdir so the fat object directory is in place."""
    mkdir_p(self.objdir)
def is_init_done(self):
Expand Down Expand Up @@ -182,9 +189,6 @@ class GitFat(object):
return cmd
def revparse(self, revname):
    """Return the whitespace-stripped output of ``git rev-parse <revname>``.

    subprocess.check_output raises CalledProcessError when git exits
    with a non-zero status.
    """
    return subprocess.check_output(['git', 'rev-parse', revname]).strip()
def encode_v1(self, digest, bytes):
    """Produce legacy representation of file to be stored in repository.

    The result is a single line containing only the digest; the
    ``bytes`` argument is accepted but not used by this format.
    """
    return '#$# git-fat %s\n' % (digest,)
def encode_v2(self, digest, bytes):
    """Produce representation of file to be stored in repository.

    The result is a single line holding the digest followed by
    ``bytes`` right-aligned in a 20-character field; 20 characters can
    hold 64-bit integers.
    """
    return '#$# git-fat %s %20d\n' % (digest, bytes)
Expand Down Expand Up @@ -254,14 +258,16 @@ class GitFat(object):
outstream.write(block)
outstream.flush()
digest = h.hexdigest()
objfile = os.path.join(self.objdir, digest)
#objfile = os.path.join(self.objdir, digest)
objfile = os.path.join(self.objdir, hash2fname(digest))
if not ishanging:
if os.path.exists(objfile):
self.verbose('git-fat filter-clean: cache already exists %s' % objfile)
os.remove(tmpname)
else:
# Set permissions for the new file using the current umask
os.chmod(tmpname, int('444', 8) & ~umask())
mkdir_p(os.path.dirname(objfile))
os.rename(tmpname, objfile)
self.verbose('git-fat filter-clean: caching to %s' % objfile)
cached = True
Expand All @@ -282,7 +288,7 @@ class GitFat(object):
self.setup()
result, bytes = self.decode_stream(sys.stdin)
if isinstance(result, str): # We got a digest
objfile = os.path.join(self.objdir, result)
objfile = os.path.join(self.objdir, hash2fname(result))
try:
cat(open(objfile), sys.stdout)
self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
Expand All @@ -293,7 +299,17 @@ class GitFat(object):
self.verbose('git-fat filter-smudge: not a managed file')
cat_iter(result, sys.stdout)
def catalog_objects(self):
    """Return the set of object names found under self.objdir.

    Objects are stored in a two-level layout: a folder named after the
    first 2 characters of the SHA-1 digest, containing files named after
    the remaining characters. Each returned entry is the relative path
    ``os.path.join(folder, filename)``, the same form hash2fname()
    produces, so the result can be intersected with sets of
    hash2fname() values.

    Top-level entries that are not 2-character directories are ignored.
    """
    catalog = set()
    for prefix in os.listdir(self.objdir):
        subdir = os.path.join(self.objdir, prefix)
        # Only 2-character directories belong to the split-digest layout.
        if len(prefix) == 2 and os.path.isdir(subdir):
            catalog.update(os.path.join(prefix, name)
                           for name in os.listdir(subdir))
    return catalog

def referenced_objects(self, rev=None, all=False):
referenced = set()
if all:
Expand Down Expand Up @@ -338,8 +354,10 @@ class GitFat(object):
content += data
bytes_read += len(data)
try:
#print('content: ' + content)
fathash = self.decode(content)[0]
referenced.add(fathash)
#referenced.add(fathash)
referenced.add(hash2fname(fathash))
except GitFat.DecodeError:
pass
# Consume LF record delimiter in `cat-file --batch` output
Expand All @@ -355,6 +373,7 @@ class GitFat(object):
p1.wait()
p2.wait()
p3.wait()
#print('referenced: ' + str(referenced))
return referenced

def orphan_files(self, patterns=[]):
Expand Down Expand Up @@ -395,19 +414,23 @@ class GitFat(object):
# (includes history). Finer-grained pushing would be useful.
pushall = '--all' in args
files = self.referenced_objects(all=pushall) & self.catalog_objects()
print('file length %d'%len(files))
cmd = self.get_rsync_command(push=True)
self.verbose('Executing: %s' % ' '.join(cmd))
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
p.communicate(input='\x00'.join(files))
#print('files list is: ' + str(files))
if p.returncode:
sys.exit(p.returncode)
def checkout(self, show_orphans=False):
'Update any stale files in the present working tree'
self.assert_init_done()
for digest, fname in self.orphan_files():
objpath = os.path.join(self.objdir, digest)
#objpath = os.path.join(self.objdir, digest)
objname = hash2fname(digest)
objpath = os.path.join(self.objdir, objname)
if os.access(objpath, os.R_OK):
print('Restoring %s -> %s' % (digest, fname))
print('Restoring %s -> %s' % (objname, fname))
# The output of our smudge filter depends on the existence of
# the file in .git/fat/objects, but git caches the file stat
# from the previous time the file was smudged, therefore it
Expand All @@ -425,7 +448,7 @@ class GitFat(object):
subprocess.check_call(
['git', 'checkout-index', '--index', '--force', fname])
elif show_orphans:
print('Data unavailable: %s %s' % (digest,fname))
print('Data unavailable: %s %s' % (objname,fname))
def cmd_pull(self, args):
'Pull anything that I have referenced, but not stored'
self.setup()
Expand All @@ -439,6 +462,7 @@ class GitFat(object):
if rev:
refargs['rev'] = rev
files = self.filter_objects(refargs, self.parse_pull_patterns(args))
#print('filters: ' + str(files))
cmd = self.get_rsync_command(push=False)
self.verbose('Executing: %s' % ' '.join(cmd))
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
Expand All @@ -460,7 +484,7 @@ class GitFat(object):
if refargs.get('all'): # Currently ignores patterns; can we efficiently do both?
return files
orphans_matched = list(self.orphan_files(patterns))
orphans_objects = set(map(lambda x: x[0], orphans_matched))
orphans_objects = set(map(lambda x: hash2fname(x[0]), orphans_matched))
return files & orphans_objects

def cmd_checkout(self, args):
Expand Down Expand Up @@ -546,7 +570,7 @@ class GitFat(object):
self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
maxlen = max(map(len,pathsizes)) if pathsizes else 0
for path, sizes in sorted(pathsizes.items(), key=lambda ps: max(ps[1]), reverse=True):
print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
print('%-*s filter.fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
revlist.wait()
difftree.wait()
def cmd_index_filter(self, args):
Expand Down Expand Up @@ -590,7 +614,7 @@ class GitFat(object):
except ValueError: # Nothing to unpack, thus no file
mode, stageno = '100644', '0'
gitattributes_lines = []
gitattributes_extra = ['%s filter=fat -text' % line.split()[0] for line in filelist]
gitattributes_extra = ['%s filter.fat -text' % line.split()[0] for line in filelist]
hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
stdout, stderr = hashobject.communicate('\n'.join(gitattributes_lines + gitattributes_extra) + '\n')
updateindex.stdin.write('%s %s %s\t%s\n' % (mode, stdout.strip(), stageno, '.gitattributes'))
Expand Down
51 changes: 51 additions & 0 deletions updatefatstorage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/python

"""
Script to convert git-fat storage file names from the flat layout, e.g.

a08929b5b00f6e9fbb60e013a0024805c75e9d42
to
a0/8929b5b00f6e9fbb60e013a0024805c75e9d42

If one folder has too many files, the performance won't be very good.

git-fat's design is rather simple. To update the storage, assume your files
are stored in
/git_storage_folder

Run the following command:

cd /git_storage_folder
updatefatstorage.py

You need to convert your local storage to the new format too.
Assume your git working tree is
/your_git_working_tree

cd /your_git_working_tree/.git/fat/objects
updatefatstorage.py

G. T.
1-23-2019

"""

import os

def mkdir_p(path):
    """Create *path* together with any missing parent directories.

    A directory that already exists is not treated as an error. Any
    other OSError from os.makedirs is re-raised, including the case
    where *path* exists but is not a directory.
    """
    import errno
    try:
        os.makedirs(path)
    except OSError as err:  # Python >2.5
        already_a_dir = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise

# Characters allowed in a SHA-1 hex digest as produced by hexdigest().
_HEX_DIGITS = frozenset('0123456789abcdef')


def migrate_storage(root='.'):
    """Move flat ``<sha1>`` object files in *root* to the ``<2>/<38>`` layout.

    Only regular files whose name is exactly 40 lowercase hex characters
    are moved, so directories and unrelated files that merely have a
    40-character name are left alone. Each new relative path is printed
    as it is created.

    root -- directory holding the flat object files (default: current dir)
    Returns the list of new paths, relative to *root*.
    """
    moved = []
    for fname in sorted(os.listdir(root)):
        if len(fname) != 40 or not _HEX_DIGITS.issuperset(fname):
            continue
        source = os.path.join(root, fname)
        if not os.path.isfile(source):
            continue
        ofname = os.path.join(fname[:2], fname[2:])
        mkdir_p(os.path.join(root, fname[:2]))
        os.rename(source, os.path.join(root, ofname))
        print(ofname)
        moved.append(ofname)
    return moved


# Run the migration only when executed as a script, not on import.
if __name__ == '__main__':
    migrate_storage('.')