Add generic importer source #62

Merged
merged 15 commits on Aug 7, 2020
188 changes: 188 additions & 0 deletions beancount_import/source/generic_importer_source.py
@@ -0,0 +1,188 @@
"""This module implements a Source Subclass for wrapping
`beancount.ingest.importer.ImporterProtocol` subclasses importers.
The importers are considered authoritative for the account they represent.

The Transaction.narration set by each importer is copied to Posting.meta['source_desc'].
This helps in predicting postings for similar transactions while allowing the
user to change the Transaction description and payee from the UI
(see readme.md for more on source_desc).
This `source_desc` meta field is also used to check for cleared postings and should not be
changed manually.

Author: Sufiyan Adhikari (github.com/dumbPy)
"""

import os
import hashlib
from glob import glob
from typing import List
from collections import defaultdict
import itertools
import datetime

from beancount.core.data import Transaction, Posting, Directive
from beancount.core import data
from beancount.core.amount import Amount
from beancount.ingest.importer import ImporterProtocol
from beancount.core.compare import hash_entry
from beancount.ingest.cache import get_file
from beancount.ingest.similar import find_similar_entries, SimilarityComparator

from ..matching import FIXME_ACCOUNT, SimpleInventory
from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData
from ..journal_editor import JournalEditor
from .description_based_source import DescriptionBasedSource


class ImporterSource(DescriptionBasedSource):
    def __init__(self,
                 directory: str,
                 account: str,
                 importer: ImporterProtocol,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self.directory = os.path.expanduser(directory)
        self.importer = importer
        self.account = account

        self._comparator = SimilarityComparator()

        # get _FileMemo object for each file
        files = [get_file(f) for f in
                 filter(os.path.isfile,
                        glob(os.path.join(directory, '**', '*'), recursive=True)
                        )
                 ]
        # filter the valid files for this importer
        self.files = [f for f in files if self.importer.identify(f)]

    @property
    def name(self):
        return self.importer.name()

    def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None:
        results.add_account(self.account)
        entries = defaultdict(list)
        for f in self.files:
            f_entries = self.importer.extract(f, existing_entries=journal.entries)
            # collect all entries in current statement, grouped by hash
            hashed_entries = defaultdict(list)
            for entry in f_entries:
                hash_ = self._hash_entry(entry, frozenset(['filename', 'lineno']))
                hashed_entries[hash_].append(entry)
            # deduplicate across statements
            for hash_ in hashed_entries:
                # skip entries already collected from other statements; add only the extra copies
                n = len(entries[hash_])
                entries[hash_].extend(hashed_entries[hash_][n:])
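                # entries[hash_] now holds as many copies as the largest count seen in any single statement so far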

        uncleared_entries = defaultdict(list)
        for hash_ in entries:
            # number of matching cleared entries already in the journal
            n = len(similar_entries_in_journal(entries[hash_][0],
                                               journal.entries,
                                               self.comparator))
            # if the journal already has n cleared entries for this hash, only the rest are pending
            for entry in entries[hash_][n:]:
                # copy the narration to the source posting's source_desc meta
                self._add_description(entry)
                # add a FIXME posting to balance the transaction
                self.balance_amounts(entry)
                uncleared_entries[hash_].append(entry)

        results.add_pending_entries(
            [ImportResult(entry.date, [entry], None)
             for entry in itertools.chain.from_iterable(uncleared_entries.values())
             ]
        )

    def comparator(self, entry1, entry2):
        """Return whether the two entries are similar and the second entry is cleared.

        The first entry is from the new entries and the second is from the journal.
        """
        return self._comparator(entry1, entry2) \
            and self.is_entry_cleared(entry2) \
            and entry1.narration == entry2.postings[0].meta['source_desc']

    def _add_description(self, entry: Transaction):
        if not isinstance(entry, Transaction):
            return None
        postings = entry.postings  # type: ignore
        to_mutate = []
        for i, posting in enumerate(postings):
            if posting.account != self.account:
                continue
            if isinstance(posting.meta, dict):
                posting.meta["source_desc"] = entry.narration
            else:
                to_mutate.append(i)
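        # rebuild source-account postings whose meta was not a dict so they carry source_desc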
        for i in to_mutate:
            p = postings.pop(i)
            p = Posting(p.account, p.units, p.cost, p.price, p.flag,
                        {"source_desc": entry.narration})
            postings.insert(i, p)

    @staticmethod
    def balance_amounts(txn: Transaction) -> None:
        """Add a posting to the FIXME account for the remaining amount to balance the transaction"""
        inventory = SimpleInventory()
        for posting in txn.postings:
            inventory += posting.units
        for currency in inventory:
            txn.postings.append(
                Posting(
                    account=FIXME_ACCOUNT,
                    units=Amount(currency=currency, number=-inventory[currency]),
                    cost=None,
                    price=None,
                    flag=None,
                    meta={},
                ))

    @staticmethod
    def _hash_entry(entry: Directive, exclude_meta_keys=frozenset()) -> str:
        """Similar to beancount.core.compare.hash_entry, but can skip selected meta fields.

        The meta fields used for hashing must be in the Transaction's meta, not the Postings' meta.
        """
        if not isinstance(entry, Transaction):
            return hash_entry(entry)
        h = hashlib.md5()
        h.update(hash_entry(entry, exclude_meta=True).encode())
        for key in entry.meta:
            if key in exclude_meta_keys:
                continue
            h.update(str(entry.meta[key]).encode())
        return h.hexdigest()

    def is_entry_cleared(self, entry: Transaction) -> bool:
"""If an entry has a cleared posting, it is considered cleared"""
for posting in entry.postings:
if self.is_posting_cleared(posting): return True
return False

    def is_posting_cleared(self, posting: Posting) -> bool:
        """Given that this source is authoritative for the account of a particular posting,
        return whether that posting is cleared.

        All postings that have the `source_desc` meta key are considered cleared.
        """
        if posting.account != self.account:
            return False
        return super().is_posting_cleared(posting)

def similar_entries_in_journal(entry: Transaction, source_entries: List[Directive],
                               comparator=None, window_days=2) -> List[Transaction]:
    """Given a new entry, find the similar entries in the journal.

    This is a rewrite of beancount.ingest.similar.find_similar_entries
    to get all possible matches for a single new entry.
    """
    window_head = datetime.timedelta(days=window_days)
    window_tail = datetime.timedelta(days=window_days + 1)

    if comparator is None:
        comparator = SimilarityComparator()

    # Look at existing entries at a nearby date.
    duplicates = []
    for source_entry in data.filter_txns(
            data.iter_entry_dates(source_entries,
                                  entry.date - window_head,
                                  entry.date + window_tail)):
        if comparator(entry, source_entry):
            duplicates.append(source_entry)
    return duplicates


def load(spec, log_status):
    return ImporterSource(log_status=log_status, **spec)
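For reference, a minimal sketch of how this source might be wired into a beancount-import run script: the spec keys mirror what load() forwards to ImporterSource, and the CSVImporter arguments follow the test below; the statements directory and the surrounding data_sources list are illustrative assumptions, not part of this change.

from beancount.ingest.importers.csv import Importer as CSVImporter, Col

data_sources = [
    {
        'module': 'beancount_import.source.generic_importer_source',
        # hypothetical directory holding the downloaded statements
        'directory': '~/bank_statements',
        'account': 'Assets:Bank',
        'importer': CSVImporter({Col.DATE: 'Date',
                                 Col.NARRATION1: 'Description',
                                 Col.AMOUNT: 'Amount'},
                                'Assets:Bank',
                                'USD',
                                '"Date","Description","Amount"'),
    },
]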
37 changes: 37 additions & 0 deletions beancount_import/source/generic_importer_source_test.py
@@ -0,0 +1,37 @@
import os

import pytest

from .source_test import check_source_example
from beancount.ingest.importers.csv import Importer as CSVImporter, Col

testdata_dir = os.path.realpath(
    os.path.join(
        os.path.dirname(__file__), '..', '..', 'testdata', 'source', 'generic_importer'))

examples = [
    'test_basic',
    'test_training_examples'
]

importer = CSVImporter({Col.DATE: 'Date',
                        Col.NARRATION1: 'Description',
                        Col.AMOUNT: 'Amount',
                        },
                       'Assets:Bank',
                       'USD',
                       '"Date","Description","Amount"',
                       )


@pytest.mark.parametrize('name', examples)
def test_source(name: str):
    check_source_example(
        example_dir=os.path.join(testdata_dir, name),
        source_spec={
            'module': 'beancount_import.source.generic_importer_source',
            'directory': testdata_dir,
            'account': 'Assets:Bank',
            'importer': importer,
        },
        replacements=[(testdata_dir, '<testdata>')])
7 changes: 7 additions & 0 deletions testdata/source/generic_importer/generic_statement.csv
@@ -0,0 +1,7 @@
"Date","Description","Amount"
2020-01-01,by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-,-1
2020-01-01,by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-,-1
2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-,1
2020-01-02,ATM-WD Some Random ATM Machine,500
2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-,1
2020-01-05,Transfer to 1234567890123,300
1 change: 1 addition & 0 deletions testdata/source/generic_importer/test_basic/accounts.txt
@@ -0,0 +1 @@
Assets:Bank
107 changes: 107 additions & 0 deletions testdata/source/generic_importer/test_basic/import_results.beancount
@@ -0,0 +1,107 @@
;; date: 2020-01-01
;; info: null

; features: [
; {
; "amount": "-1 USD",
; "date": "2020-01-01",
; "key_value_pairs": {
; "desc": "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-"
Assets:Bank -1 USD
source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-"
Expenses:FIXME 1 USD

;; date: 2020-01-01
;; info: null

; features: [
; {
; "amount": "-1 USD",
; "date": "2020-01-01",
; "key_value_pairs": {
; "desc": "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-"
Assets:Bank -1 USD
source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-"
Expenses:FIXME 1 USD

;; date: 2020-01-02
;; info: null

; features: [
; {
; "amount": "1 USD",
; "date": "2020-01-02",
; "key_value_pairs": {
; "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Assets:Bank 1 USD
source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Expenses:FIXME -1 USD

;; date: 2020-01-02
;; info: null

; features: [
; {
; "amount": "1 USD",
; "date": "2020-01-02",
; "key_value_pairs": {
; "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Assets:Bank 1 USD
source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Expenses:FIXME -1 USD

;; date: 2020-01-02
;; info: null

; features: [
; {
; "amount": "500 USD",
; "date": "2020-01-02",
; "key_value_pairs": {
; "desc": "ATM-WD Some Random ATM Machine"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-02 * "ATM-WD Some Random ATM Machine"
Assets:Bank 500 USD
source_desc: "ATM-WD Some Random ATM Machine"
Expenses:FIXME -500 USD

;; date: 2020-01-05
;; info: null

; features: [
; {
; "amount": "300 USD",
; "date": "2020-01-05",
; "key_value_pairs": {
; "desc": "Transfer to 1234567890123"
; },
; "source_account": "Assets:Bank"
; }
; ]
2020-01-05 * "Transfer to 1234567890123"
Assets:Bank 300 USD
source_desc: "Transfer to 1234567890123"
Expenses:FIXME -300 USD
@@ -0,0 +1 @@
1900-01-01 open Assets:Bank
@@ -0,0 +1 @@
[]
@@ -0,0 +1 @@
Assets:Bank
@@ -0,0 +1 @@

@@ -0,0 +1,41 @@
1900-01-01 open Assets:Bank
1900-01-01 open Assets:Cash
1900-01-01 open Expenses:Misc
1900-01-01 open Liabilities:JohnDoe


2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-"
Assets:Bank -1 USD
source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-"
cleared: TRUE
Expenses:Misc 1 USD

2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-"
Assets:Bank -1 USD
source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-"
cleared: TRUE
Expenses:Misc 1 USD

2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Assets:Bank 1 USD
source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
cleared: TRUE
Expenses:Misc -1 USD

2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
Assets:Bank 1 USD
source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-"
cleared: TRUE
Expenses:Misc -1 USD

2020-01-02 * "ATM-WD Some Random ATM Machine"
Assets:Bank 500 USD
source_desc: "ATM-WD Some Random ATM Machine"
cleared: TRUE
Assets:Cash -500 USD

2020-01-05 * "Transfer to 1234567890123"
Assets:Bank 300 USD
source_desc: "Transfer to 1234567890123"
cleared: TRUE
Liabilities:JohnDoe -300 USD