From ef2879f7a5b349f167be8fc3f54ccb51769688c4 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sun, 2 Aug 2020 16:17:20 +0530 Subject: [PATCH 01/15] Add generic importer source that imports transactions from beancount.ingest.importer.ImporterProtocol subclass importers --- .../source/generic_importer_source.py | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 beancount_import/source/generic_importer_source.py diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py new file mode 100644 index 00000000..30c70d3a --- /dev/null +++ b/beancount_import/source/generic_importer_source.py @@ -0,0 +1,142 @@ +"""This module implements a Source Subclass for wrapping +`beancount.ingest.importer.ImporterProtocol` subclasses importers. +The importers are considered athoritative of the account they represent. + +The Transaction.narration set by each importer is copied to Posting.meta[source_desc] +This helps in predicting postings for similar transaction while allowing the +user to change the Transaction description and payee from UI (see readme.md for more on source_desc) + +Author: Sufiyan Adhikari(github.com/dumbPy) +""" + +import os +import hashlib +from glob import glob + +from beancount.core.data import Transaction, Posting, Directive +from beancount.core.amount import Amount +from beancount.ingest.importer import ImporterProtocol +from beancount.core.compare import hash_entry +from beancount.ingest.cache import get_file +from beancount.ingest.similar import find_similar_entries, SimilarityComparator + +from ..matching import FIXME_ACCOUNT, SimpleInventory +from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData +from ..journal_editor import JournalEditor + + +class ImporterSource(Source): + def __init__(self, + directory: str, + account: str , + importer: ImporterProtocol, + account_name:str = None, + **kwargs) -> None: + super().__init__(**kwargs) + self.directory = os.path.expanduser(directory) + self.importer = importer + self.account = account + self.account_name = account_name if account_name else self.name + + self.comparator = SimilarityComparator() + + # get _FileMemo object for each file + files = [get_file(f) for f in + glob(os.path.join(directory, '**', '*'), recursive=True) + ] + # filter the valid files for this importer + self.files = [f for f in files if self.importer.identify(f)] + + @property + def name(self): + return self.importer.name() + + def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: + results.add_account(self.account) + entries = {} + for f in self.files: + f_entries = self.importer.extract(f) + # deduplicate across statements + hashed_entries = {} + for entry in f_entries: + hash_ = self._hash_entry(entry, frozenset(['filename','lineno'])) + # skip the existing entries from other statements + if hash_ in entries: continue + # If the entry exists in the journal, skip + if self._is_existing(journal, entry): continue + # add importer name as sorce description to source postings + self._add_description(entry) + # balance amount + self.balance_amounts(entry) + hashed_entries[hash_] = entry + entries = {**entries, **hashed_entries} + + results.add_pending_entries( + [ImportResult(entry.date, [entry], None) + for entry in entries.values() + ] + ) + + def _is_existing(self, journal: 'JournalEditor', entry: Directive) -> bool: + """Check if the entry exists in journal""" + comp_result = find_similar_entries([entry], journal.entries, self.comparator, 0) + if comp_result: return True + return False + + + def _add_description(self, entry: Transaction): + if not isinstance(entry, Transaction): return None + postings = entry.postings #type: ignore + to_mutate = [] + for i, posting in enumerate(postings): + if isinstance(posting.meta, dict): posting.meta["source_desc"] = entry.narration + else: to_mutate.append(i) + for i in to_mutate: + p = postings.pop(i) + p = Posting(p.account, p.units, p.cost, p.price, p.flag, {"source_desc":entry.narration}) + postings.insert(i, p) + + @staticmethod + def balance_amounts(txn:Transaction)-> None: + """Add FIXME account for the remaing amount to balance accounts""" + inventory = SimpleInventory() + for posting in txn.postings: + inventory += posting.units + for currency in inventory: + txn.postings.append( + Posting( + account=FIXME_ACCOUNT, + units=Amount(currency=currency, number=-inventory[currency]), + cost=None, + price=None, + flag=None, + meta={}, + )) + + @staticmethod + def _hash_entry(entry:Directive, exclude_meta_keys=frozenset()) -> str: + """Similar to beancount.core.compare.hash_entry but can skip selective meta fields + the meta fields to be used for hashing should be in Transaction's meta, not Posting's meta + """ + if not isinstance(entry, Transaction): return hash_entry(entry) + h = hashlib.md5() + h.update(hash_entry(entry, exclude_meta=True).encode()) + for key in entry.meta: + if key in exclude_meta_keys: continue + h.update(str(entry.meta[key]).encode()) + return h.hexdigest() + + def is_posting_cleared(self, posting: Posting) -> bool: + """Given than this source is athoritative of the accoutn of a particular posting, + return if that posting is cleared. + This is an added layer of filter on what postings are used for training classifiers. + Each Individual Importer can either implement it if required or else + all postings from this importer are considered cleared by default + """ + if getattr(self.importer, 'is_posting_cleared', None): + return self.importer.is_posting_cleared(posting) + return True + + +def load(spec, log_status): + return ImporterSource(log_status=log_status, **spec) \ No newline at end of file From 76110c75c7bd97f57e4145644e435e87beac58cc Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sun, 2 Aug 2020 17:41:02 +0530 Subject: [PATCH 02/15] Postings with transaction_desc should be cleared is_posting_cleared should check for source_desc and use that to determine if a posting is cleared. This avoids clearing manually entered postings and helps reconcile them --- .../source/generic_importer_source.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 30c70d3a..78ac8ecd 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -12,6 +12,7 @@ import os import hashlib from glob import glob +from typing import List, Tuple from beancount.core.data import Transaction, Posting, Directive from beancount.core.amount import Amount @@ -78,12 +79,15 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: ) def _is_existing(self, journal: 'JournalEditor', entry: Directive) -> bool: - """Check if the entry exists in journal""" - comp_result = find_similar_entries([entry], journal.entries, self.comparator, 0) - if comp_result: return True + """Check if the entry exists in journal and is cleared""" + matches:List[Tuple[Transaction, Transaction]] = \ + find_similar_entries([entry], journal.entries, self.comparator, 0) + if not matches: return False + for posting in matches[0][1].postings: + if self.is_posting_cleared(posting): + return True return False - def _add_description(self, entry: Transaction): if not isinstance(entry, Transaction): return None postings = entry.postings #type: ignore @@ -131,11 +135,13 @@ def is_posting_cleared(self, posting: Posting) -> bool: return if that posting is cleared. This is an added layer of filter on what postings are used for training classifiers. Each Individual Importer can either implement it if required or else - all postings from this importer are considered cleared by default + all postings which have `source_desc` meta key are considered cleared """ if getattr(self.importer, 'is_posting_cleared', None): return self.importer.is_posting_cleared(posting) - return True + if isinstance(posting.meta, dict) and "source_desc" in posting.meta: + return True + return False def load(spec, log_status): From e4d313478124e00a931ec40ab25643accad88641 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Thu, 6 Aug 2020 14:32:33 +0530 Subject: [PATCH 03/15] Reimplement deduplication for statements & journal --- .../source/generic_importer_source.py | 91 ++++++++++++++----- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 78ac8ecd..640ca434 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -4,7 +4,10 @@ The Transaction.narration set by each importer is copied to Posting.meta[source_desc] This helps in predicting postings for similar transaction while allowing the -user to change the Transaction description and payee from UI (see readme.md for more on source_desc) +user to change the Transaction description and payee from UI +(see readme.md for more on source_desc) +This `source_desc` meta is also used for check cleared postings and should not be +changed manually Author: Sufiyan Adhikari(github.com/dumbPy) """ @@ -12,9 +15,13 @@ import os import hashlib from glob import glob -from typing import List, Tuple +from typing import List +from collections import defaultdict +import itertools +import datetime from beancount.core.data import Transaction, Posting, Directive +from beancount.core import data from beancount.core.amount import Amount from beancount.ingest.importer import ImporterProtocol from beancount.core.compare import hash_entry @@ -29,7 +36,7 @@ class ImporterSource(Source): def __init__(self, directory: str, - account: str , + account: str, importer: ImporterProtocol, account_name:str = None, **kwargs) -> None: @@ -39,7 +46,7 @@ def __init__(self, self.account = account self.account_name = account_name if account_name else self.name - self.comparator = SimilarityComparator() + self._comparator = SimilarityComparator() # get _FileMemo object for each file files = [get_file(f) for f in @@ -54,39 +61,47 @@ def name(self): def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: results.add_account(self.account) - entries = {} + entries = defaultdict(list) for f in self.files: f_entries = self.importer.extract(f) - # deduplicate across statements - hashed_entries = {} + # collect all entries in current statement, grouped by hash + hashed_entries = defaultdict(list) for entry in f_entries: hash_ = self._hash_entry(entry, frozenset(['filename','lineno'])) - # skip the existing entries from other statements - if hash_ in entries: continue - # If the entry exists in the journal, skip - if self._is_existing(journal, entry): continue + hashed_entries[hash_].append(entry) + # deduplicate across statements + for hash_ in hashed_entries: + # skip the existing entries from other statements. add remaining + n = len(entries[hash_]) + entries[hash_].extend(hashed_entries[hash_][n:]) + + uncleared_entries = defaultdict(list) + for hash_ in entries: + # number of matching cleared entries in journal + n = len(similar_entries_in_journal(entries[hash_][0], + journal.entries, + self.comparator)) + # If journal has n cleared entries for this hash, pick remaining + for entry in entries[hash_][n:]: # add importer name as sorce description to source postings self._add_description(entry) # balance amount self.balance_amounts(entry) - hashed_entries[hash_] = entry - entries = {**entries, **hashed_entries} + uncleared_entries[hash_].append(entry) results.add_pending_entries( [ImportResult(entry.date, [entry], None) - for entry in entries.values() + for entry in itertools.chain.from_iterable(uncleared_entries.values()) ] ) - def _is_existing(self, journal: 'JournalEditor', entry: Directive) -> bool: - """Check if the entry exists in journal and is cleared""" - matches:List[Tuple[Transaction, Transaction]] = \ - find_similar_entries([entry], journal.entries, self.comparator, 0) - if not matches: return False - for posting in matches[0][1].postings: - if self.is_posting_cleared(posting): - return True - return False + def comparator(self, entry1, entry2): + """Returns if the two entries are similar and 2nd entry is cleared. + The first entry is from new_entries and 2nd is from journal + """ + return self._comparator(entry1, entry2) \ + and self.is_entry_cleared(entry2) \ + and entry1.narration == entry2.postings[0].meta['source_desc'] def _add_description(self, entry: Transaction): if not isinstance(entry, Transaction): return None @@ -130,10 +145,15 @@ def _hash_entry(entry:Directive, exclude_meta_keys=frozenset()) -> str: h.update(str(entry.meta[key]).encode()) return h.hexdigest() + def is_entry_cleared(self, entry: Transaction) -> bool: + """If an entry has a cleared posting, it is considered cleared""" + for posting in entry.postings: + if self.is_posting_cleared(posting): return True + return False + def is_posting_cleared(self, posting: Posting) -> bool: - """Given than this source is athoritative of the accoutn of a particular posting, + """Given than this source is athoritative of the account of a particular posting, return if that posting is cleared. - This is an added layer of filter on what postings are used for training classifiers. Each Individual Importer can either implement it if required or else all postings which have `source_desc` meta key are considered cleared """ @@ -143,6 +163,27 @@ def is_posting_cleared(self, posting: Posting) -> bool: return True return False +def similar_entries_in_journal(entry:Transaction, source_entries:List[Directive], + comparator=None, window_days=2) -> List[Transaction]: + """Given a hashed entry, find the similar entries in the journal + This is a rewrite of beancount.ingest.similar.find_similar_entries + to get all possible matches for a single new entry + """ + window_head = datetime.timedelta(days=window_days) + window_tail = datetime.timedelta(days=window_days + 1) + + if comparator is None: + comparator = SimilarityComparator() + + # Look at existing entries at a nearby date. + duplicates = [] + for source_entry in data.filter_txns( + data.iter_entry_dates(source_entries, + entry.date - window_head, + entry.date + window_tail)): + if comparator(entry, source_entry): + duplicates.append(source_entry) + return duplicates def load(spec, log_status): return ImporterSource(log_status=log_status, **spec) \ No newline at end of file From 17bb104b6c153807794fea5ea53e7d61848408b0 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Thu, 6 Aug 2020 17:02:07 +0530 Subject: [PATCH 04/15] Only clear this account postings --- beancount_import/source/generic_importer_source.py | 1 + 1 file changed, 1 insertion(+) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 640ca434..fe01dff7 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -157,6 +157,7 @@ def is_posting_cleared(self, posting: Posting) -> bool: Each Individual Importer can either implement it if required or else all postings which have `source_desc` meta key are considered cleared """ + if posting.account != self.account: return False if getattr(self.importer, 'is_posting_cleared', None): return self.importer.is_posting_cleared(posting) if isinstance(posting.meta, dict) and "source_desc" in posting.meta: From 304f44e263569ed0bb3622c9bcdf600d597259e3 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Thu, 6 Aug 2020 17:08:23 +0530 Subject: [PATCH 05/15] Importers need not implement is_posting_cleared source_desc is used for checking and clearing postings. this is the way!! --- beancount_import/source/generic_importer_source.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index fe01dff7..1e43cc51 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -154,12 +154,9 @@ def is_entry_cleared(self, entry: Transaction) -> bool: def is_posting_cleared(self, posting: Posting) -> bool: """Given than this source is athoritative of the account of a particular posting, return if that posting is cleared. - Each Individual Importer can either implement it if required or else - all postings which have `source_desc` meta key are considered cleared + All postings which have `source_desc` meta key are considered cleared """ if posting.account != self.account: return False - if getattr(self.importer, 'is_posting_cleared', None): - return self.importer.is_posting_cleared(posting) if isinstance(posting.meta, dict) and "source_desc" in posting.meta: return True return False From ca47baa53453bc7705dee5368fd26fc1b921344b Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Thu, 6 Aug 2020 22:01:26 +0530 Subject: [PATCH 06/15] Inherit DescriptionBasedSource for ImporterSource --- beancount_import/source/generic_importer_source.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 1e43cc51..76c6427d 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -31,9 +31,10 @@ from ..matching import FIXME_ACCOUNT, SimpleInventory from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData from ..journal_editor import JournalEditor +from .description_based_source import DescriptionBasedSource -class ImporterSource(Source): +class ImporterSource(DescriptionBasedSource): def __init__(self, directory: str, account: str, @@ -157,9 +158,7 @@ def is_posting_cleared(self, posting: Posting) -> bool: All postings which have `source_desc` meta key are considered cleared """ if posting.account != self.account: return False - if isinstance(posting.meta, dict) and "source_desc" in posting.meta: - return True - return False + return super().is_posting_cleared(posting) def similar_entries_in_journal(entry:Transaction, source_entries:List[Directive], comparator=None, window_days=2) -> List[Transaction]: @@ -184,4 +183,4 @@ def similar_entries_in_journal(entry:Transaction, source_entries:List[Directive] return duplicates def load(spec, log_status): - return ImporterSource(log_status=log_status, **spec) \ No newline at end of file + return ImporterSource(log_status=log_status, **spec) From bea4b479798a01f20169e99c7afd6342c753b4a0 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Fri, 7 Aug 2020 00:44:13 +0530 Subject: [PATCH 07/15] Add tests and some very minor changes in ImporterSource --- .../source/generic_importer_source.py | 8 +- .../source/generic_importer_source_test.py | 37 ++++++ .../generic_importer/generic_statement.csv | 7 ++ .../generic_importer/test_basic/accounts.txt | 1 + .../test_basic/import_results.beancount | 107 ++++++++++++++++++ .../test_basic/journal.beancount | 1 + .../test_basic/training_examples.json | 1 + .../test_training_examples/accounts.txt | 1 + .../import_results.beancount | 1 + .../test_training_examples/journal.beancount | 41 +++++++ .../training_examples.json | 68 +++++++++++ 11 files changed, 270 insertions(+), 3 deletions(-) create mode 100644 beancount_import/source/generic_importer_source_test.py create mode 100644 testdata/source/generic_importer/generic_statement.csv create mode 100644 testdata/source/generic_importer/test_basic/accounts.txt create mode 100644 testdata/source/generic_importer/test_basic/import_results.beancount create mode 100644 testdata/source/generic_importer/test_basic/journal.beancount create mode 100644 testdata/source/generic_importer/test_basic/training_examples.json create mode 100644 testdata/source/generic_importer/test_training_examples/accounts.txt create mode 100644 testdata/source/generic_importer/test_training_examples/import_results.beancount create mode 100644 testdata/source/generic_importer/test_training_examples/journal.beancount create mode 100644 testdata/source/generic_importer/test_training_examples/training_examples.json diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 76c6427d..d7bde273 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -39,19 +39,19 @@ def __init__(self, directory: str, account: str, importer: ImporterProtocol, - account_name:str = None, **kwargs) -> None: super().__init__(**kwargs) self.directory = os.path.expanduser(directory) self.importer = importer self.account = account - self.account_name = account_name if account_name else self.name self._comparator = SimilarityComparator() # get _FileMemo object for each file files = [get_file(f) for f in + filter(os.path.isfile, glob(os.path.join(directory, '**', '*'), recursive=True) + ) ] # filter the valid files for this importer self.files = [f for f in files if self.importer.identify(f)] @@ -64,7 +64,7 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: results.add_account(self.account) entries = defaultdict(list) for f in self.files: - f_entries = self.importer.extract(f) + f_entries = self.importer.extract(f, existing_entries=journal.entries) # collect all entries in current statement, grouped by hash hashed_entries = defaultdict(list) for entry in f_entries: @@ -109,6 +109,7 @@ def _add_description(self, entry: Transaction): postings = entry.postings #type: ignore to_mutate = [] for i, posting in enumerate(postings): + if posting.account != self.account: continue if isinstance(posting.meta, dict): posting.meta["source_desc"] = entry.narration else: to_mutate.append(i) for i in to_mutate: @@ -182,5 +183,6 @@ def similar_entries_in_journal(entry:Transaction, source_entries:List[Directive] duplicates.append(source_entry) return duplicates + def load(spec, log_status): return ImporterSource(log_status=log_status, **spec) diff --git a/beancount_import/source/generic_importer_source_test.py b/beancount_import/source/generic_importer_source_test.py new file mode 100644 index 00000000..0ddc912d --- /dev/null +++ b/beancount_import/source/generic_importer_source_test.py @@ -0,0 +1,37 @@ +import os + +import pytest + +from .source_test import check_source_example +from beancount.ingest.importers.csv import Importer as CSVImporter, Col + +testdata_dir = os.path.realpath( + os.path.join( + os.path.dirname(__file__), '..', '..', 'testdata', 'source', 'generic_importer')) + +examples = [ + 'test_basic', + 'test_training_examples' +] + +importer = CSVImporter({Col.DATE: 'Date', + Col.NARRATION1: 'Description', + Col.AMOUNT: 'Amount', + }, + 'Assets:Bank', + 'USD', + '"Date","Description","Amount"', + ) + + +@pytest.mark.parametrize('name', examples) +def test_source(name: str): + check_source_example( + example_dir=os.path.join(testdata_dir, name), + source_spec={ + 'module': 'beancount_import.source.generic_importer_source', + 'directory': testdata_dir, + 'account': 'Assets:Bank', + 'importer': importer, + }, + replacements=[(testdata_dir, '')]) diff --git a/testdata/source/generic_importer/generic_statement.csv b/testdata/source/generic_importer/generic_statement.csv new file mode 100644 index 00000000..5d46d425 --- /dev/null +++ b/testdata/source/generic_importer/generic_statement.csv @@ -0,0 +1,7 @@ +"Date","Description","Amount" +2020-01-01,by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-,-1 +2020-01-01,by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-,-1 +2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-,1 +2020-01-02,ATM-WD Some Random ATM Machine,500 +2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-,1 +2020-01-05,Transfer to 1234567890123,300 diff --git a/testdata/source/generic_importer/test_basic/accounts.txt b/testdata/source/generic_importer/test_basic/accounts.txt new file mode 100644 index 00000000..255958da --- /dev/null +++ b/testdata/source/generic_importer/test_basic/accounts.txt @@ -0,0 +1 @@ +Assets:Bank diff --git a/testdata/source/generic_importer/test_basic/import_results.beancount b/testdata/source/generic_importer/test_basic/import_results.beancount new file mode 100644 index 00000000..d1f407eb --- /dev/null +++ b/testdata/source/generic_importer/test_basic/import_results.beancount @@ -0,0 +1,107 @@ +;; date: 2020-01-01 +;; info: null + +; features: [ +; { +; "amount": "-1 USD", +; "date": "2020-01-01", +; "key_value_pairs": { +; "desc": "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + Expenses:FIXME 1 USD + +;; date: 2020-01-01 +;; info: null + +; features: [ +; { +; "amount": "-1 USD", +; "date": "2020-01-01", +; "key_value_pairs": { +; "desc": "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + Expenses:FIXME 1 USD + +;; date: 2020-01-02 +;; info: null + +; features: [ +; { +; "amount": "1 USD", +; "date": "2020-01-02", +; "key_value_pairs": { +; "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Expenses:FIXME -1 USD + +;; date: 2020-01-02 +;; info: null + +; features: [ +; { +; "amount": "1 USD", +; "date": "2020-01-02", +; "key_value_pairs": { +; "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Expenses:FIXME -1 USD + +;; date: 2020-01-02 +;; info: null + +; features: [ +; { +; "amount": "500 USD", +; "date": "2020-01-02", +; "key_value_pairs": { +; "desc": "ATM-WD Some Random ATM Machine" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-02 * "ATM-WD Some Random ATM Machine" + Assets:Bank 500 USD + source_desc: "ATM-WD Some Random ATM Machine" + Expenses:FIXME -500 USD + +;; date: 2020-01-05 +;; info: null + +; features: [ +; { +; "amount": "300 USD", +; "date": "2020-01-05", +; "key_value_pairs": { +; "desc": "Transfer to 1234567890123" +; }, +; "source_account": "Assets:Bank" +; } +; ] +2020-01-05 * "Transfer to 1234567890123" + Assets:Bank 300 USD + source_desc: "Transfer to 1234567890123" + Expenses:FIXME -300 USD diff --git a/testdata/source/generic_importer/test_basic/journal.beancount b/testdata/source/generic_importer/test_basic/journal.beancount new file mode 100644 index 00000000..431f1819 --- /dev/null +++ b/testdata/source/generic_importer/test_basic/journal.beancount @@ -0,0 +1 @@ +1900-01-01 open Assets:Bank diff --git a/testdata/source/generic_importer/test_basic/training_examples.json b/testdata/source/generic_importer/test_basic/training_examples.json new file mode 100644 index 00000000..0637a088 --- /dev/null +++ b/testdata/source/generic_importer/test_basic/training_examples.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/testdata/source/generic_importer/test_training_examples/accounts.txt b/testdata/source/generic_importer/test_training_examples/accounts.txt new file mode 100644 index 00000000..255958da --- /dev/null +++ b/testdata/source/generic_importer/test_training_examples/accounts.txt @@ -0,0 +1 @@ +Assets:Bank diff --git a/testdata/source/generic_importer/test_training_examples/import_results.beancount b/testdata/source/generic_importer/test_training_examples/import_results.beancount new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/testdata/source/generic_importer/test_training_examples/import_results.beancount @@ -0,0 +1 @@ + diff --git a/testdata/source/generic_importer/test_training_examples/journal.beancount b/testdata/source/generic_importer/test_training_examples/journal.beancount new file mode 100644 index 00000000..c31042f0 --- /dev/null +++ b/testdata/source/generic_importer/test_training_examples/journal.beancount @@ -0,0 +1,41 @@ +1900-01-01 open Assets:Bank +1900-01-01 open Assets:Cash +1900-01-01 open Expenses:Misc +1900-01-01 open Liabilities:JohnDoe + + +2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + cleared: TRUE + Expenses:Misc 1 USD + +2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + cleared: TRUE + Expenses:Misc 1 USD + +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + cleared: TRUE + Expenses:Misc -1 USD + +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + cleared: TRUE + Expenses:Misc -1 USD + +2020-01-02 * "ATM-WD Some Random ATM Machine" + Assets:Bank 500 USD + source_desc: "ATM-WD Some Random ATM Machine" + cleared: TRUE + Assets:Cash -500 USD + +2020-01-05 * "Transfer to 1234567890123" + Assets:Bank 300 USD + source_desc: "Transfer to 1234567890123" + cleared: TRUE + Liabilities:JohnDoe -300 USD diff --git a/testdata/source/generic_importer/test_training_examples/training_examples.json b/testdata/source/generic_importer/test_training_examples/training_examples.json new file mode 100644 index 00000000..cae7eff6 --- /dev/null +++ b/testdata/source/generic_importer/test_training_examples/training_examples.json @@ -0,0 +1,68 @@ +[ + [ + { + "amount": "-1 USD", + "date": "2020-01-01", + "key_value_pairs": { + "desc": "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "-1 USD", + "date": "2020-01-01", + "key_value_pairs": { + "desc": "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "1 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "1 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "500 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "ATM-WD Some Random ATM Machine" + }, + "source_account": "Assets:Bank" + }, + "Assets:Cash" + ], + [ + { + "amount": "300 USD", + "date": "2020-01-05", + "key_value_pairs": { + "desc": "Transfer to 1234567890123" + }, + "source_account": "Assets:Bank" + }, + "Liabilities:JohnDoe" + ] +] \ No newline at end of file From bc19ed6b434f77e5758f4baf9cb23ed5d2b25a72 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Fri, 7 Aug 2020 16:30:11 +0530 Subject: [PATCH 08/15] Use get_pending_and_invalid_entries 3rd revamp. hashing now replaced with _get_key_from_imported_entry date added to posting.meta tests updated test_invalid added --- .../source/generic_importer_source.py | 170 ++++++++---------- .../source/generic_importer_source_test.py | 1 + .../test_basic/import_results.beancount | 18 +- .../test_invalid/accounts.txt | 1 + .../test_invalid/import_results.beancount | 1 + .../test_invalid/journal.beancount | 68 +++++++ .../test_invalid/training_examples.json | 90 ++++++++++ .../test_training_examples/journal.beancount | 6 + 8 files changed, 252 insertions(+), 103 deletions(-) create mode 100644 testdata/source/generic_importer/test_invalid/accounts.txt create mode 100644 testdata/source/generic_importer/test_invalid/import_results.beancount create mode 100644 testdata/source/generic_importer/test_invalid/journal.beancount create mode 100644 testdata/source/generic_importer/test_invalid/training_examples.json diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index d7bde273..6ac8a476 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -21,17 +21,16 @@ import datetime from beancount.core.data import Transaction, Posting, Directive -from beancount.core import data from beancount.core.amount import Amount from beancount.ingest.importer import ImporterProtocol from beancount.core.compare import hash_entry from beancount.ingest.cache import get_file -from beancount.ingest.similar import find_similar_entries, SimilarityComparator from ..matching import FIXME_ACCOUNT, SimpleInventory -from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData +from . import ImportResult, SourceResults from ..journal_editor import JournalEditor -from .description_based_source import DescriptionBasedSource +from .description_based_source import DescriptionBasedSource, get_pending_and_invalid_entries +from .mint import _get_key_from_posting class ImporterSource(DescriptionBasedSource): @@ -45,8 +44,6 @@ def __init__(self, self.importer = importer self.account = account - self._comparator = SimilarityComparator() - # get _FileMemo object for each file files = [get_file(f) for f in filter(os.path.isfile, @@ -68,41 +65,23 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: # collect all entries in current statement, grouped by hash hashed_entries = defaultdict(list) for entry in f_entries: - hash_ = self._hash_entry(entry, frozenset(['filename','lineno'])) - hashed_entries[hash_].append(entry) + key_ = self._get_key_from_imported_entry(entry) + self._add_description(entry) + hashed_entries[key_].append(entry) # deduplicate across statements - for hash_ in hashed_entries: + for key_ in hashed_entries: # skip the existing entries from other statements. add remaining - n = len(entries[hash_]) - entries[hash_].extend(hashed_entries[hash_][n:]) - - uncleared_entries = defaultdict(list) - for hash_ in entries: - # number of matching cleared entries in journal - n = len(similar_entries_in_journal(entries[hash_][0], - journal.entries, - self.comparator)) - # If journal has n cleared entries for this hash, pick remaining - for entry in entries[hash_][n:]: - # add importer name as sorce description to source postings - self._add_description(entry) - # balance amount - self.balance_amounts(entry) - uncleared_entries[hash_].append(entry) - - results.add_pending_entries( - [ImportResult(entry.date, [entry], None) - for entry in itertools.chain.from_iterable(uncleared_entries.values()) - ] - ) - - def comparator(self, entry1, entry2): - """Returns if the two entries are similar and 2nd entry is cleared. - The first entry is from new_entries and 2nd is from journal - """ - return self._comparator(entry1, entry2) \ - and self.is_entry_cleared(entry2) \ - and entry1.narration == entry2.postings[0].meta['source_desc'] + n = len(entries[key_]) + entries[key_].extend(hashed_entries[key_][n:]) + + get_pending_and_invalid_entries( + raw_entries=list(itertools.chain.from_iterable(entries.values())), + journal_entries=journal.all_entries, + account_set=set([self.account]), + get_key_from_posting=_get_key_from_posting, + get_key_from_raw_entry=self._get_key_from_imported_entry, + make_import_result=self._make_import_result, + results=results) def _add_description(self, entry: Transaction): if not isinstance(entry, Transaction): return None @@ -110,48 +89,22 @@ def _add_description(self, entry: Transaction): to_mutate = [] for i, posting in enumerate(postings): if posting.account != self.account: continue - if isinstance(posting.meta, dict): posting.meta["source_desc"] = entry.narration - else: to_mutate.append(i) + if isinstance(posting.meta, dict): + posting.meta["source_desc"] = entry.narration + posting.meta["date"] = entry.date + break + else: + to_mutate.append(i) + break for i in to_mutate: p = postings.pop(i) - p = Posting(p.account, p.units, p.cost, p.price, p.flag, {"source_desc":entry.narration}) + p = Posting(p.account, p.units, p.cost, p.price, p.flag, + {"source_desc":entry.narration, "date": entry.date}) postings.insert(i, p) - @staticmethod - def balance_amounts(txn:Transaction)-> None: - """Add FIXME account for the remaing amount to balance accounts""" - inventory = SimpleInventory() - for posting in txn.postings: - inventory += posting.units - for currency in inventory: - txn.postings.append( - Posting( - account=FIXME_ACCOUNT, - units=Amount(currency=currency, number=-inventory[currency]), - cost=None, - price=None, - flag=None, - meta={}, - )) - - @staticmethod - def _hash_entry(entry:Directive, exclude_meta_keys=frozenset()) -> str: - """Similar to beancount.core.compare.hash_entry but can skip selective meta fields - the meta fields to be used for hashing should be in Transaction's meta, not Posting's meta - """ - if not isinstance(entry, Transaction): return hash_entry(entry) - h = hashlib.md5() - h.update(hash_entry(entry, exclude_meta=True).encode()) - for key in entry.meta: - if key in exclude_meta_keys: continue - h.update(str(entry.meta[key]).encode()) - return h.hexdigest() - - def is_entry_cleared(self, entry: Transaction) -> bool: - """If an entry has a cleared posting, it is considered cleared""" + def _get_source_posting(self, entry:Transaction): for posting in entry.postings: - if self.is_posting_cleared(posting): return True - return False + if posting.account == self.account: return posting def is_posting_cleared(self, posting: Posting) -> bool: """Given than this source is athoritative of the account of a particular posting, @@ -161,27 +114,50 @@ def is_posting_cleared(self, posting: Posting) -> bool: if posting.account != self.account: return False return super().is_posting_cleared(posting) -def similar_entries_in_journal(entry:Transaction, source_entries:List[Directive], - comparator=None, window_days=2) -> List[Transaction]: - """Given a hashed entry, find the similar entries in the journal - This is a rewrite of beancount.ingest.similar.find_similar_entries - to get all possible matches for a single new entry - """ - window_head = datetime.timedelta(days=window_days) - window_tail = datetime.timedelta(days=window_days + 1) - - if comparator is None: - comparator = SimilarityComparator() - - # Look at existing entries at a nearby date. - duplicates = [] - for source_entry in data.filter_txns( - data.iter_entry_dates(source_entries, - entry.date - window_head, - entry.date + window_tail)): - if comparator(entry, source_entry): - duplicates.append(source_entry) - return duplicates + def _get_key_from_imported_entry(self, entry:Transaction): + return (self.account, + entry.date, + self._get_source_posting(entry).units, + entry.narration) + + def _make_import_result(self, imported_entry:Directive): + if isinstance(imported_entry, Transaction): balance_amounts(imported_entry) + result = ImportResult( + date=imported_entry.date, info=get_info(imported_entry), entries=[imported_entry]) + # delete filename since it is used by beancount-import to determine if the + # entry is from journal. + imported_entry.meta.pop('filename') + return result + +def _get_key_from_posting(entry: Transaction, posting: Posting, + source_postings: List[Posting], source_desc: str, + posting_date: datetime.date): + del entry + del source_postings + return (posting.account, posting_date, posting.units, source_desc) + +def get_info(raw_entry: Directive) -> dict: + return dict( + type=get_file(raw_entry.meta['filename']).mimetype(), + filename=raw_entry.meta['filename'], + line=raw_entry.meta['lineno'], + ) + +def balance_amounts(txn:Transaction)-> None: + """Add FIXME account for the remaing amount to balance accounts""" + inventory = SimpleInventory() + for posting in txn.postings: + inventory += posting.units + for currency in inventory: + txn.postings.append( + Posting( + account=FIXME_ACCOUNT, + units=Amount(currency=currency, number=-inventory[currency]), + cost=None, + price=None, + flag=None, + meta={}, + )) def load(spec, log_status): diff --git a/beancount_import/source/generic_importer_source_test.py b/beancount_import/source/generic_importer_source_test.py index 0ddc912d..087a3543 100644 --- a/beancount_import/source/generic_importer_source_test.py +++ b/beancount_import/source/generic_importer_source_test.py @@ -11,6 +11,7 @@ examples = [ 'test_basic', + 'test_invalid', 'test_training_examples' ] diff --git a/testdata/source/generic_importer/test_basic/import_results.beancount b/testdata/source/generic_importer/test_basic/import_results.beancount index d1f407eb..1912e28a 100644 --- a/testdata/source/generic_importer/test_basic/import_results.beancount +++ b/testdata/source/generic_importer/test_basic/import_results.beancount @@ -1,5 +1,5 @@ ;; date: 2020-01-01 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 1, "type": "text/csv"} ; features: [ ; { @@ -13,11 +13,12 @@ ; ] 2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" Assets:Bank -1 USD + date: 2020-01-01 source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" Expenses:FIXME 1 USD ;; date: 2020-01-01 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 2, "type": "text/csv"} ; features: [ ; { @@ -31,11 +32,12 @@ ; ] 2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" Assets:Bank -1 USD + date: 2020-01-01 source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" Expenses:FIXME 1 USD ;; date: 2020-01-02 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 3, "type": "text/csv"} ; features: [ ; { @@ -49,11 +51,12 @@ ; ] 2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Assets:Bank 1 USD + date: 2020-01-02 source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Expenses:FIXME -1 USD ;; date: 2020-01-02 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 5, "type": "text/csv"} ; features: [ ; { @@ -67,11 +70,12 @@ ; ] 2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Assets:Bank 1 USD + date: 2020-01-02 source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Expenses:FIXME -1 USD ;; date: 2020-01-02 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 4, "type": "text/csv"} ; features: [ ; { @@ -85,11 +89,12 @@ ; ] 2020-01-02 * "ATM-WD Some Random ATM Machine" Assets:Bank 500 USD + date: 2020-01-02 source_desc: "ATM-WD Some Random ATM Machine" Expenses:FIXME -500 USD ;; date: 2020-01-05 -;; info: null +;; info: {"filename": "/generic_statement.csv", "line": 6, "type": "text/csv"} ; features: [ ; { @@ -103,5 +108,6 @@ ; ] 2020-01-05 * "Transfer to 1234567890123" Assets:Bank 300 USD + date: 2020-01-05 source_desc: "Transfer to 1234567890123" Expenses:FIXME -300 USD diff --git a/testdata/source/generic_importer/test_invalid/accounts.txt b/testdata/source/generic_importer/test_invalid/accounts.txt new file mode 100644 index 00000000..255958da --- /dev/null +++ b/testdata/source/generic_importer/test_invalid/accounts.txt @@ -0,0 +1 @@ +Assets:Bank diff --git a/testdata/source/generic_importer/test_invalid/import_results.beancount b/testdata/source/generic_importer/test_invalid/import_results.beancount new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/testdata/source/generic_importer/test_invalid/import_results.beancount @@ -0,0 +1 @@ + diff --git a/testdata/source/generic_importer/test_invalid/journal.beancount b/testdata/source/generic_importer/test_invalid/journal.beancount new file mode 100644 index 00000000..2fa032d8 --- /dev/null +++ b/testdata/source/generic_importer/test_invalid/journal.beancount @@ -0,0 +1,68 @@ +1900-01-01 open Assets:Bank +1900-01-01 open Assets:Cash +1900-01-01 open Expenses:Misc +1900-01-01 open Liabilities:JohnDoe + + +2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + count: 1 + date: 2020-01-01 + cleared: TRUE + invalid0: "1 extra" + Expenses:Misc 1 USD + +2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + count: 2 + date: 2020-01-01 + cleared: TRUE + invalid0: "1 extra" + Expenses:Misc 1 USD + +2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + Assets:Bank -1 USD + source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + count: 3 + date: 2020-01-01 + cleared: TRUE + Expenses:Misc 1 USD + +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + date: 2020-01-02 + cleared: TRUE + Expenses:Misc -1 USD + +2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + Assets:Bank 1 USD + source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + date: 2020-01-02 + cleared: TRUE + Expenses:Misc -1 USD + +2020-01-02 * "ATM-WD Some Random ATM Machine" + Assets:Bank 500 USD + source_desc: "ATM-WD Some Random ATM Machine" + date: 2020-01-02 + cleared: TRUE + Assets:Cash -500 USD + +2020-01-05 * "Transfer to 1234567890123" + Assets:Bank 300 USD + source_desc: "Transfer to 1234567890123" + date: 2020-01-05 + cleared: TRUE + Liabilities:JohnDoe -300 USD + +2020-01-06 * "Transfer to 1234567890321" + info: "doesn't exist in statement hence invalid" + Assets:Bank 111.11 USD + source_desc: "Transfer to 1234567890123" + date: 2020-01-05 + cleared: TRUE + invalid1: "1 extra" + Liabilities:JohnDoe -111.11 USD diff --git a/testdata/source/generic_importer/test_invalid/training_examples.json b/testdata/source/generic_importer/test_invalid/training_examples.json new file mode 100644 index 00000000..d0e7c237 --- /dev/null +++ b/testdata/source/generic_importer/test_invalid/training_examples.json @@ -0,0 +1,90 @@ +[ + [ + { + "amount": "-1 USD", + "date": "2020-01-01", + "key_value_pairs": { + "desc": "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "-1 USD", + "date": "2020-01-01", + "key_value_pairs": { + "desc": "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "-1 USD", + "date": "2020-01-01", + "key_value_pairs": { + "desc": "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "1 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "1 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + }, + "source_account": "Assets:Bank" + }, + "Expenses:Misc" + ], + [ + { + "amount": "500 USD", + "date": "2020-01-02", + "key_value_pairs": { + "desc": "ATM-WD Some Random ATM Machine" + }, + "source_account": "Assets:Bank" + }, + "Assets:Cash" + ], + [ + { + "amount": "300 USD", + "date": "2020-01-05", + "key_value_pairs": { + "desc": "Transfer to 1234567890123" + }, + "source_account": "Assets:Bank" + }, + "Liabilities:JohnDoe" + ], + [ + { + "amount": "111.11 USD", + "date": "2020-01-05", + "key_value_pairs": { + "desc": "Transfer to 1234567890123" + }, + "source_account": "Assets:Bank" + }, + "Liabilities:JohnDoe" + ] +] \ No newline at end of file diff --git a/testdata/source/generic_importer/test_training_examples/journal.beancount b/testdata/source/generic_importer/test_training_examples/journal.beancount index c31042f0..0db95193 100644 --- a/testdata/source/generic_importer/test_training_examples/journal.beancount +++ b/testdata/source/generic_importer/test_training_examples/journal.beancount @@ -7,35 +7,41 @@ 2020-01-01 * "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" Assets:Bank -1 USD source_desc: "by debit card-OTHPG 063441 GOOGLE CLOUD INDIA PVTTHANE-" + date: 2020-01-01 cleared: TRUE Expenses:Misc 1 USD 2020-01-01 * "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" Assets:Bank -1 USD source_desc: "by debit card-OTHPG 063444 GOOGLE CLOUD INDIA PVTTHANE-" + date: 2020-01-01 cleared: TRUE Expenses:Misc 1 USD 2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Assets:Bank 1 USD source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + date: 2020-01-02 cleared: TRUE Expenses:Misc -1 USD 2020-01-02 * "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" Assets:Bank 1 USD source_desc: "BULK POSTING- 00000008237 250120 GOOGLE CLOUD INDIA PVT-" + date: 2020-01-02 cleared: TRUE Expenses:Misc -1 USD 2020-01-02 * "ATM-WD Some Random ATM Machine" Assets:Bank 500 USD source_desc: "ATM-WD Some Random ATM Machine" + date: 2020-01-02 cleared: TRUE Assets:Cash -500 USD 2020-01-05 * "Transfer to 1234567890123" Assets:Bank 300 USD source_desc: "Transfer to 1234567890123" + date: 2020-01-05 cleared: TRUE Liabilities:JohnDoe -300 USD From 678424e7fad400df18fb76a631c15eaaf7b1ec47 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Fri, 7 Aug 2020 16:38:38 +0530 Subject: [PATCH 09/15] Use inheritted is_posting_cleared --- beancount_import/source/generic_importer_source.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 6ac8a476..2398e5bc 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -106,14 +106,6 @@ def _get_source_posting(self, entry:Transaction): for posting in entry.postings: if posting.account == self.account: return posting - def is_posting_cleared(self, posting: Posting) -> bool: - """Given than this source is athoritative of the account of a particular posting, - return if that posting is cleared. - All postings which have `source_desc` meta key are considered cleared - """ - if posting.account != self.account: return False - return super().is_posting_cleared(posting) - def _get_key_from_imported_entry(self, entry:Transaction): return (self.account, entry.date, From c3f755908b1bdb92282901c04eab2c24ca73ee8c Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Fri, 7 Aug 2020 23:59:48 +0530 Subject: [PATCH 10/15] Fix mypy errors --- .../source/generic_importer_source.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 2398e5bc..b4e4ec39 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -13,17 +13,14 @@ """ import os -import hashlib from glob import glob -from typing import List from collections import defaultdict import itertools -import datetime +from typing import Hashable, List, Dict, Optional from beancount.core.data import Transaction, Posting, Directive from beancount.core.amount import Amount from beancount.ingest.importer import ImporterProtocol -from beancount.core.compare import hash_entry from beancount.ingest.cache import get_file from ..matching import FIXME_ACCOUNT, SimpleInventory @@ -54,12 +51,13 @@ def __init__(self, self.files = [f for f in files if self.importer.identify(f)] @property - def name(self): + def name(self) -> str: return self.importer.name() def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: results.add_account(self.account) - entries = defaultdict(list) + + entries:Dict[Hashable,List[Directive]] = defaultdict(list) for f in self.files: f_entries = self.importer.extract(f, existing_entries=journal.entries) # collect all entries in current statement, grouped by hash @@ -85,7 +83,7 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: def _add_description(self, entry: Transaction): if not isinstance(entry, Transaction): return None - postings = entry.postings #type: ignore + postings: List[Posting] = entry.postings to_mutate = [] for i, posting in enumerate(postings): if posting.account != self.account: continue @@ -102,14 +100,18 @@ def _add_description(self, entry: Transaction): {"source_desc":entry.narration, "date": entry.date}) postings.insert(i, p) - def _get_source_posting(self, entry:Transaction): + def _get_source_posting(self, entry:Transaction) -> Optional[Posting]: for posting in entry.postings: - if posting.account == self.account: return posting + if posting.account == self.account: + return posting - def _get_key_from_imported_entry(self, entry:Transaction): + def _get_key_from_imported_entry(self, entry:Transaction) -> Hashable: + source_posting = self._get_source_posting(entry) + if source_posting is None: + raise ValueError("entry has no postings for {self.account}") return (self.account, entry.date, - self._get_source_posting(entry).units, + source_posting.units, entry.narration) def _make_import_result(self, imported_entry:Directive): @@ -121,12 +123,6 @@ def _make_import_result(self, imported_entry:Directive): imported_entry.meta.pop('filename') return result -def _get_key_from_posting(entry: Transaction, posting: Posting, - source_postings: List[Posting], source_desc: str, - posting_date: datetime.date): - del entry - del source_postings - return (posting.account, posting_date, posting.units, source_desc) def get_info(raw_entry: Directive) -> dict: return dict( From cbbe06d7b6a5400b9825af6b68fc51a617a1543a Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sat, 8 Aug 2020 00:07:03 +0530 Subject: [PATCH 11/15] =?UTF-8?q?return=20None=20explicitly=20because=20my?= =?UTF-8?q?py=20=F0=9F=98=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- beancount_import/source/generic_importer_source.py | 1 + 1 file changed, 1 insertion(+) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index b4e4ec39..971fbc60 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -104,6 +104,7 @@ def _get_source_posting(self, entry:Transaction) -> Optional[Posting]: for posting in entry.postings: if posting.account == self.account: return posting + return None def _get_key_from_imported_entry(self, entry:Transaction) -> Hashable: source_posting = self._get_source_posting(entry) From 6b79380da719c6af520688f96fba80db602098a7 Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sat, 8 Aug 2020 00:09:36 +0530 Subject: [PATCH 12/15] add f to fstring --- beancount_import/source/generic_importer_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 971fbc60..0c194a1b 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -109,7 +109,7 @@ def _get_source_posting(self, entry:Transaction) -> Optional[Posting]: def _get_key_from_imported_entry(self, entry:Transaction) -> Hashable: source_posting = self._get_source_posting(entry) if source_posting is None: - raise ValueError("entry has no postings for {self.account}") + raise ValueError(f"entry has no postings for {self.account}") return (self.account, entry.date, source_posting.units, From de8f73aebb71cb95d489b842334170b7f7f668ae Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sat, 8 Aug 2020 00:14:11 +0530 Subject: [PATCH 13/15] pre python3.6 typehinting style --- beancount_import/source/generic_importer_source.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 0c194a1b..41676cda 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -57,7 +57,7 @@ def name(self) -> str: def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: results.add_account(self.account) - entries:Dict[Hashable,List[Directive]] = defaultdict(list) + entries = defaultdict(list) #type: Dict[Hashable,List[Directive]] for f in self.files: f_entries = self.importer.extract(f, existing_entries=journal.entries) # collect all entries in current statement, grouped by hash @@ -83,7 +83,7 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: def _add_description(self, entry: Transaction): if not isinstance(entry, Transaction): return None - postings: List[Posting] = entry.postings + postings = entry.postings #type: List[Posting] to_mutate = [] for i, posting in enumerate(postings): if posting.account != self.account: continue From bf414dd96b23c9595b2bd3521ccd9840aae61efd Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sat, 8 Aug 2020 00:20:03 +0530 Subject: [PATCH 14/15] remove fstring for python 3.5 support --- beancount_import/source/generic_importer_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index 41676cda..a26ae755 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -109,7 +109,7 @@ def _get_source_posting(self, entry:Transaction) -> Optional[Posting]: def _get_key_from_imported_entry(self, entry:Transaction) -> Hashable: source_posting = self._get_source_posting(entry) if source_posting is None: - raise ValueError(f"entry has no postings for {self.account}") + raise ValueError("entry has no postings for account: {}".format(self.account)) return (self.account, entry.date, source_posting.units, From 7450bce543a9ac268b6bc5e97b557c9596823aed Mon Sep 17 00:00:00 2001 From: Sufiyan Adhikari Date: Sat, 8 Aug 2020 01:03:17 +0530 Subject: [PATCH 15/15] Use OrderedDict instead of defaultdict --- .../source/generic_importer_source.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/beancount_import/source/generic_importer_source.py b/beancount_import/source/generic_importer_source.py index a26ae755..da0e458c 100644 --- a/beancount_import/source/generic_importer_source.py +++ b/beancount_import/source/generic_importer_source.py @@ -14,7 +14,7 @@ import os from glob import glob -from collections import defaultdict +from collections import OrderedDict import itertools from typing import Hashable, List, Dict, Optional @@ -57,20 +57,23 @@ def name(self) -> str: def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None: results.add_account(self.account) - entries = defaultdict(list) #type: Dict[Hashable,List[Directive]] + entries = OrderedDict() #type: Dict[Hashable, List[Directive]] for f in self.files: f_entries = self.importer.extract(f, existing_entries=journal.entries) # collect all entries in current statement, grouped by hash - hashed_entries = defaultdict(list) + hashed_entries = OrderedDict() #type: Dict[Hashable, Directive] for entry in f_entries: key_ = self._get_key_from_imported_entry(entry) self._add_description(entry) - hashed_entries[key_].append(entry) + hashed_entries.setdefault(key_, []).append(entry) # deduplicate across statements for key_ in hashed_entries: # skip the existing entries from other statements. add remaining - n = len(entries[key_]) - entries[key_].extend(hashed_entries[key_][n:]) + if not key_ in entries: + n = 0 + else: + n = len(entries[key_]) + entries.setdefault(key_, []).extend(hashed_entries[key_][n:]) get_pending_and_invalid_entries( raw_entries=list(itertools.chain.from_iterable(entries.values())),