Skip to content

Commit

Permalink
Implement soft merge, configurable merge options
Browse files Browse the repository at this point in the history
Timeliner has always had the ability to merge items: if reprocessing, new items would overwrite data from existing items with the same ID. Now, identical items can be merged in configurable ways, including via a new mode called "soft merge" which works on items with different IDs that are similar enough to be considered identical.

For example, Google Photos can be downloaded via the API or imported via a Takeout archive. While the Takeout archive provides location metadata, it unfortunately does not provide item IDs, so using both the API and Takeout would duplicate the entire library. Enabling soft merging will compare the timestamp and filename of each item and, if they match, consider the items to be identical and combine them. Yay!

This also made it necessary to configure which values are preferred for certain fields, for example the old or new ID, the old or new data file, etc.

This is a big refactor and likely introduced some bugs but it worked in my initial, tired testing well after midnight.
  • Loading branch information
mholt committed Dec 19, 2020
1 parent c2d1332 commit 6ef78cb
Show file tree
Hide file tree
Showing 17 changed files with 352 additions and 133 deletions.
91 changes: 57 additions & 34 deletions cmd/timeliner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ func init() {
flag.BoolVar(&prune, "prune", prune, "When finishing, delete items not found on remote (download-all or import only)")
flag.BoolVar(&integrity, "integrity", integrity, "Perform integrity check on existing items and reprocess if needed (download-all or import only)")
flag.BoolVar(&reprocess, "reprocess", reprocess, "Reprocess every item that has not been modified locally (download-all or import only)")
flag.BoolVar(&softMerge, "softmerge", softMerge, "Merge incoming data with existing row using 'soft' keys (account ID + item timestamp + one of text, filename, and hash)")
flag.StringVar(&keep, "keep", keep, "Comma-separated list of existing values to keep if merge is performed (preferring existing value): id,ts,text,file")

flag.StringVar(&tfStartInput, "start", "", "Timeframe start (relative=duration, absolute=YYYY/MM/DD)")
flag.StringVar(&tfEndInput, "end", "", "Timeframe end (relative=duration, absolute=YYYY/MM/DD)")
Expand Down Expand Up @@ -111,6 +113,37 @@ func main() {
return
}

// get the timeframe within which to constrain item processing (multiple commands use this)
tf, err := parseTimeframe()
if err != nil {
log.Fatalf("[FATAL] %v", err)
}

// make the processing options
mergeOptions := timeliner.MergeOptions{SoftMerge: softMerge}
keepFields := strings.Split(keep, ",")
for _, val := range keepFields {
switch val {
case "id":
mergeOptions.PreferExistingID = true
case "ts":
mergeOptions.PreferExistingTimestamp = true
case "text":
mergeOptions.PreferExistingDataText = true
case "file":
mergeOptions.PreferExistingDataFile = true
default:
log.Fatalf("[FATAL] Unrecognized value for 'keep' argument: '%s'", val)
}
}
procOpt := timeliner.ProcessingOptions{
Reprocess: reprocess,
Prune: prune,
Integrity: integrity,
Timeframe: tf,
Merge: mergeOptions,
}

// make a client for each account
var clients []timeliner.WrappedClient
for _, a := range accounts {
Expand All @@ -133,15 +166,10 @@ func main() {

switch subcmd {
case "get-latest":
if reprocess || prune || integrity || tfStartInput != "" {
if procOpt.Reprocess || procOpt.Prune || procOpt.Integrity || procOpt.Timeframe.Since != nil {
log.Fatalf("[FATAL] The get-latest subcommand does not support -reprocess, -prune, -integrity, or -start")
}

_, tfEnd, err := parseTimeframe()
if err != nil {
log.Fatalf("[FATAL] %v", err)
}

var wg sync.WaitGroup
for _, wc := range clients {
wg.Add(1)
Expand All @@ -152,7 +180,7 @@ func main() {
if retryNum > 0 {
log.Println("[INFO] Retrying command")
}
err := wc.GetLatest(ctx, tfEnd)
err := wc.GetLatest(ctx, tf.Until)
if err != nil {
log.Printf("[ERROR][%s/%s] Getting latest: %v",
wc.DataSourceID(), wc.UserID(), err)
Expand All @@ -169,11 +197,6 @@ func main() {
wg.Wait()

case "get-all":
tfStart, tfEnd, err := parseTimeframe()
if err != nil {
log.Fatalf("[FATAL] %v", err)
}

var wg sync.WaitGroup
for _, wc := range clients {
wg.Add(1)
Expand All @@ -184,7 +207,7 @@ func main() {
if retryNum > 0 {
log.Println("[INFO] Retrying command")
}
err := wc.GetAll(ctx, reprocess, prune, integrity, timeliner.Timeframe{Since: tfStart, Until: tfEnd})
err := wc.GetAll(ctx, procOpt)
if err != nil {
log.Printf("[ERROR][%s/%s] Downloading all: %v",
wc.DataSourceID(), wc.UserID(), err)
Expand All @@ -205,7 +228,7 @@ func main() {
wc := clients[0]

ctx, cancel := context.WithCancel(context.Background())
err = wc.Import(ctx, file, reprocess, prune, integrity)
err = wc.Import(ctx, file, procOpt)
if err != nil {
log.Printf("[ERROR][%s/%s] Importing: %v",
wc.DataSourceID(), wc.UserID(), err)
Expand All @@ -218,44 +241,42 @@ func main() {
}

// parseTimeframe parses tfStartInput and/or tfEndInput and returns
// the resulting start and end times (may be nil), or an error.
func parseTimeframe() (start, end *time.Time, err error) {
var tfStart, tfEnd time.Time
// the resulting timeframe or an error.
func parseTimeframe() (timeliner.Timeframe, error) {
var tf timeliner.Timeframe
var timeStart, timeEnd time.Time

if tfStartInput != "" {
var tfStartRel time.Duration
tfStartRel, err = time.ParseDuration(tfStartInput)
tfStartRel, err := time.ParseDuration(tfStartInput)
if err == nil {
tfStart = time.Now().Add(tfStartRel)
timeStart = time.Now().Add(tfStartRel)
} else {
tfStart, err = time.Parse(dateFormat, tfStartInput)
timeStart, err = time.Parse(dateFormat, tfStartInput)
if err != nil {
err = fmt.Errorf("bad timeframe start value '%s': %v", tfStartInput, err)
return
return tf, fmt.Errorf("bad timeframe start value '%s': %v", tfStartInput, err)
}
}
start = &tfStart
tf.Since = &timeStart
}

if tfEndInput != "" {
var tfEndRel time.Duration
tfEndRel, err = time.ParseDuration(tfEndInput)
tfEndRel, err := time.ParseDuration(tfEndInput)
if err == nil {
tfEnd = time.Now().Add(tfEndRel)
timeEnd = time.Now().Add(tfEndRel)
} else {
tfEnd, err = time.Parse(dateFormat, tfEndInput)
timeEnd, err = time.Parse(dateFormat, tfEndInput)
if err != nil {
err = fmt.Errorf("bad timeframe end value '%s': %v", tfEndInput, err)
return
return tf, fmt.Errorf("bad timeframe end value '%s': %v", tfEndInput, err)
}
}
end = &tfEnd
tf.Until = &timeEnd
}

if start != nil && end != nil && end.Before(*start) {
err = fmt.Errorf("end time must be after start time (start=%s end=%s)", start, end)
if tf.Since != nil && tf.Until != nil && tf.Until.Before(*tf.Since) {
return tf, fmt.Errorf("end time must be after start time (start=%s end=%s)", tf.Since, tf.Until)
}

return
return tf, nil
}

func loadConfig() error {
Expand Down Expand Up @@ -353,6 +374,8 @@ var (
integrity bool
prune bool
reprocess bool
softMerge bool
keep string

tfStartInput, tfEndInput string

Expand Down
3 changes: 1 addition & 2 deletions datasource.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ type DataSource struct {
// authFunc gets the authentication function for this
// service. If s.Authenticate is set, it returns that;
// if s.OAuth2 is set, it uses a standard OAuth2 func.
// TODO: update godoc
func (ds DataSource) authFunc() AuthenticateFn {
if ds.Authenticate != nil {
return ds.Authenticate
Expand Down Expand Up @@ -191,7 +190,7 @@ type Client interface {
// timeliner.Checkpoint to set a checkpoint. Checkpoints are not
// required, but if the implementation sets checkpoints, it
// should be able to resume from one, too.
ListItems(ctx context.Context, itemChan chan<- *ItemGraph, opt Options) error
ListItems(ctx context.Context, itemChan chan<- *ItemGraph, opt ListingOptions) error
}

// Timeframe represents a start and end time and/or
Expand Down
2 changes: 1 addition & 1 deletion datasources/facebook/facebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ type Client struct {
}

// ListItems lists the items on the Facebook account.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename != "" {
Expand Down
2 changes: 1 addition & 1 deletion datasources/googlelocation/googlelocation.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func init() {
type Client struct{}

// ListItems lists items from the data source. opt.Filename must be non-empty.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename == "" {
Expand Down
2 changes: 1 addition & 1 deletion datasources/googlephotos/googlephotos.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ type Client struct {

// ListItems lists items from the data source.
// opt.Timeframe precision is day-level at best.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename != "" {
Expand Down
29 changes: 19 additions & 10 deletions datasources/googlephotos/takeoutarchive.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
"github.com/mholt/timeliner"
)

func (c *Client) listFromTakeoutArchive(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) listFromTakeoutArchive(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
err := archiver.Walk(opt.Filename, func(f archiver.File) error {
pathInArchive := getPathInArchive(f) // TODO: maybe this should be a function in the archiver lib

Expand Down Expand Up @@ -69,20 +69,27 @@ func (c *Client) listFromTakeoutArchive(ctx context.Context, itemChan chan<- *ti
itemMeta.pathInArchive = strings.TrimSuffix(pathInArchive, ".json")
itemMeta.archiveFilename = opt.Filename

collection.Items = append(collection.Items, timeliner.CollectionItem{
Item: itemMeta,
Position: len(collection.Items),
})
withinTimeframe := (opt.Timeframe.Since == nil || itemMeta.parsedPhotoTakenTime.After(*opt.Timeframe.Since)) &&
(opt.Timeframe.Until == nil || itemMeta.parsedPhotoTakenTime.Before(*opt.Timeframe.Until))

if withinTimeframe {
collection.Items = append(collection.Items, timeliner.CollectionItem{
Item: itemMeta,
Position: len(collection.Items),
})
}

return nil
})
if err != nil {
return err
}

ig := timeliner.NewItemGraph(nil)
ig.Collections = append(ig.Collections, collection)
itemChan <- ig
if len(collection.Items) > 0 {
ig := timeliner.NewItemGraph(nil)
ig.Collections = append(ig.Collections, collection)
itemChan <- ig
}

return nil
})
Expand All @@ -99,7 +106,7 @@ func getPathInArchive(f archiver.File) string {
switch hdr := f.Header.(type) {
case zip.FileHeader:
return hdr.Name
case tar.Header:
case *tar.Header:
return hdr.Name
}
return ""
Expand Down Expand Up @@ -187,8 +194,10 @@ func (m mediaArchiveMetadata) timestamp() (time.Time, error) {
return time.Unix(parsed, 0), nil
}

// ID does NOT return the same ID as from the API. Takeout archives do NOT
// have an ID associated with each item, so we do our best by making up
// an ID using the timestamp and the filename.
func (m mediaArchiveMetadata) ID() string {
// TODO: THIS IS NOT THE SAME AS THE ID FROM THE API
return m.PhotoTakenTime.Timestamp + "_" + m.Title
}

Expand Down
2 changes: 1 addition & 1 deletion datasources/instagram/instagram.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func init() {
type Client struct{}

// ListItems lists items from the data source. opt.Filename must be non-empty.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename == "" {
Expand Down
2 changes: 1 addition & 1 deletion datasources/smsbackuprestore/smsbackuprestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ type Client struct {
}

// ListItems lists items from the data source.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename == "" {
Expand Down
2 changes: 1 addition & 1 deletion datasources/twitter/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
"github.com/mholt/timeliner"
)

func (c *Client) getFromAPI(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) getFromAPI(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
// load any previous checkpoint
c.checkpoint.load(opt.Checkpoint)

Expand Down
2 changes: 1 addition & 1 deletion datasources/twitter/archives.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"github.com/mholt/timeliner"
)

func (c *Client) getFromArchiveFile(itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) getFromArchiveFile(itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
// load the user's account ID
var err error
c.ownerAccount, err = c.getOwnerAccountFromArchive(opt.Filename)
Expand Down
2 changes: 1 addition & 1 deletion datasources/twitter/twitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ type Client struct {
}

// ListItems lists items from opt.Filename if specified, or from the API otherwise.
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.Options) error {
func (c *Client) ListItems(ctx context.Context, itemChan chan<- *timeliner.ItemGraph, opt timeliner.ListingOptions) error {
defer close(itemChan)

if opt.Filename != "" {
Expand Down
5 changes: 3 additions & 2 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ CREATE TABLE IF NOT EXISTS "items" (
"account_id" INTEGER NOT NULL,
"original_id" TEXT NOT NULL, -- ID provided by the data source
"person_id" INTEGER NOT NULL,
"timestamp" INTEGER, -- timestamp when item content was originally created
"stored" INTEGER NOT NULL DEFAULT (strftime('%s', CURRENT_TIME)), -- timestamp row was created - TODO not sure if needed
"timestamp" INTEGER, -- timestamp when item content was originally created (NOT when the database row was created)
"stored" INTEGER NOT NULL DEFAULT (strftime('%s', CURRENT_TIME)), -- timestamp row was created or last updated from source
"modified" INTEGER, -- timestamp when item was locally modified; if not null, then item is "not clean"
"class" INTEGER,
"mime_type" TEXT,
Expand All @@ -106,6 +106,7 @@ CREATE TABLE IF NOT EXISTS "items" (
CREATE INDEX IF NOT EXISTS "idx_items_timestamp" ON "items"("timestamp");
CREATE INDEX IF NOT EXISTS "idx_items_data_text" ON "items"("data_text");
CREATE INDEX IF NOT EXISTS "idx_items_data_file" ON "items"("data_file");
CREATE INDEX IF NOT EXISTS "idx_items_data_hash" ON "items"("data_hash");
-- Relationships draws relationships between and across items and persons.
Expand Down
1 change: 0 additions & 1 deletion itemfiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ func (t *Timeline) openUniqueCanonicalItemDataFile(it Item, dataSourceID string)
tryPath = strings.TrimSuffix(tryPath, lastAppend)
lastAppend = fmt.Sprintf("_%d%s", i+1, ext) // start at 1, but actually 2 because existing file is "1"
tryPath += lastAppend

continue
}
if err != nil {
Expand Down
3 changes: 2 additions & 1 deletion itemgraph.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ type ItemRow struct {
Location

metaGob []byte // use Metadata.(encode/decode)
item Item
}

// Location contains location information.
Expand Down Expand Up @@ -302,7 +303,7 @@ type Relation struct {
Bidirectional bool
}

// Collection represents a group of items.
// Collection represents a group of items, like an album.
type Collection struct {
// The ID of the collection as given
// by the service; for example, the
Expand Down
Loading

0 comments on commit 6ef78cb

Please sign in to comment.