Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make generic importer more flexible #234

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 19 additions & 14 deletions beancount_import/source/generic_importer_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
class ImporterSource(DescriptionBasedSource):
def __init__(self,
directory: str,
account: str,
importer: ImporterProtocol,
# use None for importers that are not authoritative and would not clear any postings
account: Optional[str]=None,
**kwargs) -> None:
super().__init__(**kwargs)
self.directory = os.path.expanduser(directory)
Expand All @@ -57,11 +58,16 @@ def name(self) -> str:
return self.importer.name()

def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None:
results.add_account(self.account)
if self.account:
results.add_account(self.account)

entries = OrderedDict() #type: Dict[Hashable, List[Directive]]
for f in self.files:
f_entries = self.importer.extract(f, existing_entries=journal.entries)
# if the importer is not authoritative, add all entries to pending
if not self.account:
results.add_pending_entries(map(self._make_import_result, f_entries))
continue
# collect all entries in current statement, grouped by hash
hashed_entries = OrderedDict() #type: Dict[Hashable, Directive]
for entry in f_entries:
Expand All @@ -77,14 +83,15 @@ def prepare(self, journal: 'JournalEditor', results: SourceResults) -> None:
n = len(entries[key_])
entries.setdefault(key_, []).extend(hashed_entries[key_][n:])

get_pending_and_invalid_entries(
raw_entries=list(itertools.chain.from_iterable(entries.values())),
journal_entries=journal.all_entries,
account_set=set([self.account]),
get_key_from_posting=_get_key_from_posting,
get_key_from_raw_entry=self._get_key_from_imported_entry,
make_import_result=self._make_import_result,
results=results)
if self.account:
get_pending_and_invalid_entries(
raw_entries=list(itertools.chain.from_iterable(entries.values())),
journal_entries=journal.all_entries,
account_set=set([self.account]),
get_key_from_posting=_get_key_from_posting,
get_key_from_raw_entry=self._get_key_from_imported_entry,
make_import_result=self._make_import_result,
results=results)

def _add_description(self, entry: Transaction):
if not isinstance(entry, Transaction): return None
Expand All @@ -93,12 +100,10 @@ def _add_description(self, entry: Transaction):
for i, posting in enumerate(postings):
if posting.account != self.account: continue
if isinstance(posting.meta, dict):
posting.meta["source_desc"] = entry.narration
posting.meta["date"] = entry.date
break
posting.meta.setdefault("source_desc", entry.narration)
posting.meta.setdefault("date", entry.date)
else:
to_mutate.append(i)
break
for i in to_mutate:
p = postings.pop(i)
p = Posting(p.account, p.units, p.cost, p.price, p.flag,
Expand Down
5 changes: 5 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@ Examples:
- `fresh`: Example of importing transactions starting with an empty journal.
- `manually_entered`: Example of importing transactions corresponding to
existing, manually-entered transactions.
- `multiple_imports`: Example of importing the same transactions from multiple
importers, e.g. you receive a transaction email on the day of the transaction,
while the monthly statement arrives at the end of the month. Here, the
transaction is imported from the email but not cleared (by setting
`account=None` in run.py); it is cleared only at the end of the month by the
monthly statement.
4 changes: 0 additions & 4 deletions examples/data/importers/bank.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
"Date","Description","Amount"
2020-01-01,by debit card-OTHPG 063441 GOOGLE CLOUD,-1
2020-01-01,by debit card-OTHPG 063444 GOOGLE CLOUD,-1
2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE,1
2020-01-02,ATM-WD Some Random ATM Machine,-500
2020-01-02,BULK POSTING- 00000008237 250120 GOOGLE,1
2020-01-05,Transfer to 1234567890123,300
2020-01-14,Transfer to Amex 431145642232,-30
30 changes: 30 additions & 0 deletions examples/data/importers/single_transaction_email.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html>
<head>
<title>Tables Example</title>
</head>
<body>
<b>FooBar Bank Transaction Alert</b>
<table>
<tr>
<th>Account</th>
</tr>
<tr>
<td>********9876</td>
</tr>
</table>
<br>
<table>
<tr>
<th>Date</th>
<th>Description</th>
<th>Amount</th>
</tr>
<tr>
<td>2020-01-14</td>
<td>Cleared Credit Card Bill</td>
<td>-30.00</td>
</tr>
</table>
</body>
</html>
5 changes: 5 additions & 0 deletions examples/multiple_imports/accounts.beancount
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1900-01-01 open Assets:FooBarBank EUR

1900-01-01 open Liabilities:Amex-Credit-Card EUR

2020-01-14 open Expenses:Misc EUR
46 changes: 46 additions & 0 deletions examples/multiple_imports/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
This config is where you initialize your importers with personal info,
such as an account number or the last four digits of a credit card.

You may also define CONFIG: List[ImporterProtocol] so that other beancount
tools (bean-identify, bean-file, and other beancount scripts) can use it,
e.g. `bean-identify config.py ~/Downloads`
to identify the files that the importers defined here can process.

beancount-import should have its own run.py, where you invoke
`beancount_import.webserver.main` and import the importer objects from this config.
"""
from beancount.ingest.importers.csv import Importer as CSVImporter, Col
from foo_bar_email_importer import FooBarTransactionEmailImporter

# Account that both the monthly statement and the alert emails belong to.
_FOOBAR_BANK_ACCOUNT = 'Assets:FooBarBank'

# CSV header name for each logical column of the bank statement.
_FOOBAR_BANK_COLUMNS = {
    Col.DATE: 'Date',
    Col.NARRATION1: 'Description',
    Col.AMOUNT: 'Amount',
}

# The credit card statement carries an extra running-balance column.
_AMEX_CC_COLUMNS = {
    Col.DATE: 'Date',
    Col.NARRATION1: 'Description',
    Col.AMOUNT: 'Amount',
    Col.BALANCE: 'Balance',
}

# Monthly bank statement (CSV).
my_foobar_bank_importer = CSVImporter(
    _FOOBAR_BANK_COLUMNS,
    _FOOBAR_BANK_ACCOUNT,  # account
    'EUR',  # currency
    # regexps used by ImporterProtocol.identify() to identify the correct file
    '"Date","Description","Amount"',
)

# Per-transaction alert emails for the same bank account.
foobar_email_importer = FooBarTransactionEmailImporter(
    filing_account=_FOOBAR_BANK_ACCOUNT,
)

# Monthly credit card statement (CSV).
my_amex_cc_importer = CSVImporter(
    _AMEX_CC_COLUMNS,
    'Liabilities:Amex-Credit-Card',  # account
    'EUR',  # currency
    # regexps used by ImporterProtocol.identify() to identify the correct file
    (
        'Date,Description,Amount,Balance',
        'Credit.*7890',
    ),
    skip_lines=1,
)

# beancount's scripts (bean-identify, bean-file, ...) look up this name.
CONFIG = [
    my_foobar_bank_importer,
    foobar_email_importer,
    my_amex_cc_importer,
]
53 changes: 53 additions & 0 deletions examples/multiple_imports/foo_bar_email_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Imports a single transaction from a transaction alert email.
The same transaction also appears in the monthly CSV statement,
so this importer does not clear the transaction
(it sets `self.account = None`).
"""

import re
from beancount.ingest import importer
from beancount.core import data, flags
from pathlib import Path
from dateutil.parser import parse as date_parse


class FooBarTransactionEmailImporter(importer.ImporterProtocol):
    """Extract the single transaction described by a FooBar Bank alert email.

    The email is an HTML file containing the marker text
    "FooBar Bank Transaction Alert" and a table whose header row is
    Date / Description / Amount followed by one data row.
    """

    # Text that must appear in the file for it to be treated as an alert email.
    _ALERT_MARKER = "FooBar Bank Transaction Alert"

    # Header row of the transaction table followed by its data row.  The cell
    # captures are non-greedy so each one stops at the first closing </td>,
    # even when several cells (or rows) share a single line of HTML.
    _TRANSACTION_ROW_RE = re.compile(
        r"<tr>\s*<th>Date</th>\s*<th>Description</th>\s*<th>Amount</th>\s*</tr>"
        r"\s*<tr>\s*<td>(?P<DATE>.*?)</td>"
        r"\s*<td>(?P<DESCRIPTION>.*?)</td>"
        r"\s*<td>(?P<AMOUNT>.*?)</td>\s*</tr>"
    )

    def __init__(self, filing_account='Assets:FooBarBank', currency='EUR'):
        """Create the importer.

        Args:
            filing_account: Account used for the posting of every extracted
                transaction.
            currency: Currency attached to the extracted amount.
        """
        self._filing_account = filing_account
        self._currency = currency
        # Deliberately None; see the module docstring for the rationale.
        self.account = None

    def identify(self, f):
        """Return True if `f` is an ``.html`` file containing the alert marker."""
        return (
            f.name.endswith(".html")
            and self._ALERT_MARKER in Path(f.name).read_text()
        )

    def extract(self, f, existing_entries=None):
        """Return a list with the one transaction found in `f`, or ``[]``.

        Args:
            f: File object; only its ``name`` attribute (a path) is used.
            existing_entries: Accepted for interface compatibility; unused.

        Returns:
            A one-element list holding a ``data.Transaction`` with a single
            posting to the filing account, or an empty list when the
            transaction table is not found in the file.
        """
        match = self._TRANSACTION_ROW_RE.search(Path(f.name).read_text())
        if match is None:
            return []

        posting = data.Posting(
            account=self._filing_account,
            units=data.Amount(data.D(match.group("AMOUNT")), self._currency),
            cost=None,
            price=None,
            flag=None,
            meta={},
        )
        txn = data.Transaction(
            meta=data.new_metadata(f.name, 0),
            date=date_parse(match.group("DATE")).date(),
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=match.group("DESCRIPTION"),
            tags=set(),
            links=set(),
            postings=[posting],
        )
        # A transaction email describes exactly one transaction.
        return [txn]
Empty file.
3 changes: 3 additions & 0 deletions examples/multiple_imports/journal.beancount
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include "accounts.beancount"
include "transactions.beancount"
include "prices.beancount"
Empty file.
62 changes: 62 additions & 0 deletions examples/multiple_imports/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import glob
import os
import json
import sys

from config import my_foobar_bank_importer, my_amex_cc_importer, foobar_email_importer


def run_reconcile(extra_args):
    """Start the beancount-import web UI for the ``multiple_imports`` example.

    ``extra_args`` is handed through unchanged as the first argument of
    ``beancount_import.webserver.main``.
    """
    import beancount_import.webserver

    journal_dir = os.path.dirname(__file__)
    # Every source reads its input files from the shared example data directory.
    importers_dir = os.path.join(
        os.path.dirname(__file__), "..", "data", "importers")

    def journal_path(filename):
        # Resolve a journal file that lives next to this script.
        return os.path.join(journal_dir, filename)

    # One (importer, account) pair per data source, in processing order.
    importer_accounts = [
        # Monthly bank statements.
        (my_foobar_bank_importer, "Assets:FooBarBank"),
        # Individual transactions from email.  This importer only imports the
        # transactions and does not clear the postings, hence account=None;
        # note that the bank statement importer above clears the postings
        # imported by this one.
        (foobar_email_importer, None),
        # Monthly credit card statements.
        (my_amex_cc_importer, "Liabilities:Amex-Credit-Card"),
    ]
    data_sources = [
        {
            "module": "beancount_import.source.generic_importer_source",
            "importer": source_importer,
            "account": source_account,
            "directory": importers_dir,
        }
        for source_importer, source_account in importer_accounts
    ]

    accounts_file = journal_path("accounts.beancount")
    beancount_import.webserver.main(
        extra_args,
        journal_input=journal_path("journal.beancount"),
        ignored_journal=journal_path("ignored.beancount"),
        default_output=journal_path("transactions.beancount"),
        open_account_output_map=[(".*", accounts_file)],
        balance_account_output_map=[(".*", accounts_file)],
        price_output=journal_path("prices.beancount"),
        data_sources=data_sources,
    )


if __name__ == "__main__":
    # Forward everything after the script name to the reconcile entry point.
    cli_args = sys.argv[1:]
    run_reconcile(cli_args)
Empty file.
Loading