diff --git a/.gitignore b/.gitignore
index 894a44c..fb9b363 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,8 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+.idea
+text_payslips/*.txt
+/payslips-month-columns.csv
+/payslips-month-rows.csv
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..6b285e1
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+"pdfminer.six" = "*"
+chardet = "==3.0.4"
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..6cce92c
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,83 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "8e6b425c47b2dc898efa6fbf512b89add52b62ac6a844422d6e3cadd31a79fff"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "index": "pypi",
+            "version": "==3.0.4"
+        },
+        "pdfminer.six": {
+            "hashes": [
+                "sha256:f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"
+            ],
+            "index": "pypi",
+            "version": "==20181108"
+        },
+        "pycryptodome": {
+            "hashes": [
+                "sha256:0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e",
+                "sha256:02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee",
+                "sha256:09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8",
+                "sha256:0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9",
+                "sha256:2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a",
+                "sha256:28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089",
+                "sha256:2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f",
+                "sha256:2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6",
+                "sha256:30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5",
+                "sha256:31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307",
+                "sha256:33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87",
+                "sha256:34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04",
+                "sha256:567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1",
+                "sha256:5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0",
+                "sha256:6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343",
+                "sha256:71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a",
+                "sha256:7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf",
+                "sha256:7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad",
+                "sha256:9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421",
+                "sha256:a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302",
+                "sha256:a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864",
+                "sha256:a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df",
+                "sha256:b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f",
+                "sha256:b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870",
+                "sha256:bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6",
+                "sha256:c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4",
+                "sha256:e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4",
"sha256:ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc" + ], + "version": "==3.8.2" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "sortedcontainers": { + "hashes": [ + "sha256:974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", + "sha256:d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60" + ], + "version": "==2.1.0" + } + }, + "develop": {} +} diff --git a/README.md b/README.md index 6db7134..8a70c55 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,74 @@ # ms-uk-payslip-parser -Parser for payslips + +Simple parser for payslips issued by MS UK. + +Converts a series of your PDF payslips into a neat CSV table. + +## Installation + +- Install Python3 3.7+ and Virtualenv +- Install dependencies +``` +# create a virtualenv +mkvirtualenv payslip-parser +# switch to virtualenv +workon payslip-parser +# install dependencies +pip3 install -r requirements.txt +``` +- Or if you have `pipenv` installed: +```bash +pipenv install +``` + +## Usage + +1. Download your payslips PDF files from the portal and put them in a directory + e.g. `~/payslips` + +2. Get into your virtualenv: + + ```bash + workon payslip-parser + ``` + + or if you have `pipenv` + + ```bash + pipenv shell + ``` + +3. First, convert PDF files to text: + + ```bash + python3 to_text.py ~/payslips ./text_payslips + ``` + + Now you should see text files with your payslips content in `text_payslips` directory. + +4. Now you can parse the text files and produce CSV tables: + + ```bash + python3 parser.py ./text_payslips + ``` + + After this you will see two CSV files in this directory: + - `payslips-month-columns.csv` - each month's data is in a separate column + - `payslips-month-rows.csv` - each month's data is in a separate row + + Every payslip item label has a short prefix identifying its payslip section: + - `.m` - metadata item + - `.d.p` - payments data item + - `.d.d` - deductions data item + - `.d.t` - totals data item + - `.d.et` - employer totals data item + - `.d.ytd` - year-to-date data item + +5. Open the CSV file in your spreadsheet editor of choice or Pandas. + + +## Feedback + +Create an issue if you encounter a problem or have a suggestions. +Or ping me on Teams. 
+
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..1c422bf
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,225 @@
+import collections
+import csv
+import re
+import sys
+from collections import OrderedDict
+from itertools import filterfalse, tee
+from pathlib import Path
+
+HEADER_FIELD = '.m.Pay Date'  # serves as the unique key for each payslip
+
+FIELDS_ORDER = [
+    HEADER_FIELD, '.m.Pay', '.m.',
+    '.d.p',
+    '.d.d',
+    '.d.t',
+    '.d.et',
+    '.d.ytd',
+]
+
+UNWANTED_FIELDS = [
+    '.m.Company Name', '.m.Account', '.m.Sort Code', '.m.NI Number', '.m.NI Category', '.m.Pay Method',
+]
+
+
+def parse_amount(amount: str):
+    amount = amount.replace(',', '')
+    if amount.endswith('-'):  # a trailing '-' marks a negative amount
+        return -float(amount[:-1])
+    else:
+        return float(amount)
+
+
+def parse_metadata(metadata_text: str):
+    metadata = {}
+    for row in metadata_text.splitlines():
+        if not row:
+            continue
+        _, cell1, cell2, cell3, _ = row.split('|')
+        for cell in [cell1, cell2, cell3]:
+            cell = cell.strip()
+            if cell:
+                separator_regex = r':\s+' if ':' in cell else r'\s\s+'
+                item, value = re.compile(separator_regex).split(cell, maxsplit=1)
+                metadata[item.strip()] = value.strip()
+
+    return metadata
+
+
+def parse_payments_table(payments_table: str):
+    payments = {}
+    deductions = {}
+    ytd_balances = {}
+    for row in payments_table.splitlines():
+        row = row.strip()
+        if not row:
+            continue
+        _, payment, deduction, ytd_balance, _ = row.split('|')
+
+        payment = payment.strip()
+        if payment:
+            payment_item, amount = re.compile(r'\s\s+').split(payment)
+            payments[payment_item] = parse_amount(amount)
+
+        deduction = deduction.strip()
+        if deduction:
+            deduction_item, amount = re.compile(r'\s\s+').split(deduction)
+            deductions[deduction_item] = parse_amount(amount)
+
+        ytd_balance = ytd_balance.strip()
+        if ytd_balance:
+            ytd_balance_item, amount = re.compile(r'\s\s+').split(ytd_balance)
+            ytd_balances[ytd_balance_item] = parse_amount(amount)
+
+    return payments, deductions, ytd_balances
+
+
+def parse_totals(totals_row: str):
+    totals = {}
+    _, payment_total, deduction_total, net_pay, _ = totals_row.split('|')
+    for total_value in [payment_total, deduction_total, net_pay]:
+        item, amount = re.compile(r':\s+').split(total_value.strip())
+        totals[item] = parse_amount(amount)
+    return totals
+
+
+def parse_employer_totals(employer_total_footer):
+    totals = {}
+    for row in employer_total_footer.strip().splitlines()[1:]:
+        row = row.strip()
+        if not row or row.count('|') != 4:
+            continue
+
+        _, this_employer_cell, _ = row.split('|', maxsplit=2)
+        item, amount = re.compile(r'\s\s+').split(this_employer_cell.strip())
+        totals[item] = parse_amount(amount)
+    return totals
+
+
+def parse_payslip(payslip_text: str):
+    address, metadata, payment_data = re.compile(r"^\s+?-+$", re.MULTILINE).split(payslip_text)
+
+    _, payment_headers, payments_table, totals_row, _, employer_total_footer = \
+        re.compile(r"^\s+?-+\|$", re.MULTILINE).split(payment_data)
+
+    metadata = parse_metadata(metadata)
+    payments, deductions, ytd_balances = parse_payments_table(payments_table)
+    totals = parse_totals(totals_row)
+    employer_totals = parse_employer_totals(employer_total_footer)
+
+    data = {
+        'p': payments,
+        'd': deductions,
+        'ytd': ytd_balances,
+        't': totals,
+        'et': employer_totals
+    }
+    return {
+        # 'address': address,
+        'm': metadata,
+        'd': data
+    }
+
+
+def print_payslip(dd, indent=""):
+    for k, v in dd.items():
+        if not hasattr(v, 'items'):
+            print(f"{k}:\n{v}")
+            # print(['*'] * 30)
+        else:
+            print(f"{k}:\n")
+            print_payslip(v, indent=indent + "  ")
+
+
+def count_fields(counts, nested_dict, prefix=''):
+    if hasattr(nested_dict, 'items'):
+        for k, v in nested_dict.items():
+            count_fields(counts, v, prefix=prefix + '.' + k)
+    else:
+        counts[prefix] += 1
+
+
+def flatten(nested_dict, flat_dict, prefix=''):
+    if hasattr(nested_dict, 'items'):
+        for k, v in nested_dict.items():
+            flatten(v, flat_dict, prefix=prefix + '.' + k)
+    else:
+        flat_dict[prefix] = nested_dict
+
+
+def write_payslip_csv_month_rows(categories, csv_table):
+    with open('payslips-month-rows.csv', 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=categories)
+        writer.writeheader()
+        for row in csv_table:
+            writer.writerow(row)
+
+
+def write_payslip_csv_month_columns(columns, csv_table):
+    with open('payslips-month-columns.csv', 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=columns)
+        # no writer.writeheader() here: the '.m.Pay Date' category row doubles as the header
+        for row in csv_table:
+            writer.writerow(row)
+
+
+def partition(pred, iterable):
+    'Use a predicate to partition entries into false entries and true entries'
+    # partition(is_odd, range(10)) --> 0 2 4 6 8  and  1 3 5 7 9
+    t1, t2 = tee(iterable)
+    return filterfalse(pred, t1), filter(pred, t2)
+
+
+def enforce_order(iterable, prefixes: list):
+    remainder = iterable
+    result = []
+    for prefix in prefixes:
+        remainder, matching = partition(lambda x: x.startswith(prefix), remainder)
+        remainder = list(remainder)
+        result += sorted(matching)
+    result += sorted(remainder)  # fields matching no prefix go last
+    return result
+
+
+if __name__ == '__main__':
+    payslips_dir = Path(sys.argv[1])
+    counts = collections.Counter()
+    csv_rows_table = []
+    for payslip_file in sorted(payslips_dir.glob('*.txt')):
+        # if payslip_file.name < '2018-04-' or payslip_file.name > '2019-04-':
+        #     continue
+        payslip_text = payslip_file.read_text(encoding='utf-8')
+        if 'Employee Number' not in payslip_text:
+            print(f"Skipping {payslip_file} ...")
+            continue
+        print(f"Parsing {payslip_file} ...")
+        payslip = parse_payslip(payslip_text)
+
+        count_fields(counts, payslip)
+        flat_payslip = {}
+        flatten(payslip, flat_payslip)
+        csv_rows_table.append(flat_payslip)
+
+    categories = counts.keys()
+    categories = enforce_order(categories, FIELDS_ORDER)
+
+    # pprint('\n'.join(categories))
+    # print(len(categories))
+    write_payslip_csv_month_rows(categories, csv_rows_table)
+
+    for unwanted_field in UNWANTED_FIELDS:
+        if unwanted_field in categories:
+            categories.remove(unwanted_field)
+
+    csv_cols_table = []
+    columns = [HEADER_FIELD, *[payslip[HEADER_FIELD] for payslip in csv_rows_table]]
+    for category in categories:
+        category_row = OrderedDict()
+        category_row[HEADER_FIELD] = category
+        for payslip in csv_rows_table:
+            month = payslip[HEADER_FIELD]
+            category_row[month] = payslip.get(category)
+        csv_cols_table.append(category_row)
+
+    write_payslip_csv_month_columns(columns, csv_cols_table)
+    print("Done.")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f63c058
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+-i https://pypi.org/simple
+chardet==3.0.4
+pdfminer.six==20181108
+pycryptodome==3.8.2
+six==1.12.0
+sortedcontainers==2.1.0
diff --git a/text_payslips/.keep b/text_payslips/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/to_text.py b/to_text.py
new file mode 100644
index 0000000..7ec3ea1
--- /dev/null
+++ b/to_text.py
@@ -0,0 +1,49 @@
+import sys
+from pathlib import Path
+
+import pdfminer.high_level
+import pdfminer.layout
+import pdfminer.settings
+
+pdfminer.settings.STRICT = False
+
+
+def extract_text(pdf_file, outfile,
+                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
+                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
+                 output_type='text', codec='utf-8', strip_control=False,
+                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                 layoutmode='normal', output_dir=None, debug=False,
+                 disable_caching=False, **other):
+    if not pdf_file:
+        raise ValueError("Must provide file to work upon!")
+
+    # If any LAParams group arguments were passed, create an LAParams object and
+    # populate with given args. Otherwise, set it to None.
+    if not no_laparams:
+        laparams = pdfminer.layout.LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            paramv = locals().get(param, None)
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+    else:
+        laparams = None
+
+    with open(outfile, "wb") as outfp:
+        with open(pdf_file, "rb") as fp:
+            pdfminer.high_level.extract_text_to_fp(fp, **locals())  # locals() forwards outfp, laparams, codec, etc.
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+    source_dir = Path(sys.argv[1]).resolve()
+    dest_dir = Path(sys.argv[2]).resolve()
+    for pdf_file in sorted(source_dir.glob('*.pdf')):
+        txt_file = dest_dir.joinpath(pdf_file.name).with_suffix('.txt')
+        print(pdf_file)
+        print(txt_file)
+        if txt_file.exists():
+            print("Already exists. Skipping...")
+        else:
+            print("Extracting...")
+            extract_text(pdf_file=pdf_file, outfile=txt_file)