Adding script to cut and reorder files by their column names
skchronicles committed Sep 8, 2023
1 parent 48c1c92 commit aceb8b7
Showing 1 changed file with 311 additions and 0 deletions.
workflow/scripts/cut2tsv.py (311 additions, 0 deletions)
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Author: Skyler Kuhn

# Standard Library
from __future__ import print_function
import argparse
import sys, os
import textwrap

# 3rd party packages,
# installed from pypi
import pandas as pd


_help = textwrap.dedent("""
@Usage:
$ ./cut2tsv.py [-h] \\
-c COL_A [COL_B COL_Y COL_Z] \\
-i FILE
@About:
Given an input file and one or more column
names to extract, this script selects each
of those columns by name. It can also be
used to re-order the columns of a file by
name. All operations performed by this
script work on column names.
To operate on columns by their index
(i.e. 1, 2, 3, ...), please use the
built-in cut command or awk.
All output is directed to standard output
in (awk-friendly) tab-delimited format. It
can be captured, redirected, or piped into
another process.
@Required Arguments:
-c, --column COL One or more column names
to extract. The order of
the columns given is the
order of the output. If a
provided column does not
exist in the input file,
a warning is printed and
that column is skipped.
-i, --input FILE Input file to extract or
reorder column(s) by their
name. The following input
file types are supported:
+ TSV (tab-delimited)
+ CSV (comma-delimited)
+ Excel
@Options:
-h, --help Shows this help message and exits.
@Example:
# Input file
$ cat file.tsv
a b c
1 2 3
4 5 6
7 8 9
# Subset and re-order
# the example input file
$ cut2tsv.py -c b a -i file.tsv
b a
2 1
5 4
8 7
"""
)


def err(*message, **kwargs):
"""Prints any provided args to standard error.
kwargs can be provided to modify print functions
behavior.
@param message <any>:
Values printed to standard error
@params kwargs <print()>
Key words to modify print function behavior
"""
print(*message, file=sys.stderr, **kwargs)



def fatal(*message, **kwargs):
"""Prints any provided args to standard error
and exits with an exit code of 1.
@param message <any>:
Values printed to standard error
@params kwargs <print()>
Keyword arguments to modify print function behavior
"""
err(*message, **kwargs)
sys.exit(1)


def read(filename, subset=[], skip='#', **kwargs):
"""Reads in an input file as a dataframe. Determines the
correct handler for reading in a given MAF file. Supports reading
in TSV files (.tsv, .txt, .text, .vcf, or .maf), CSV files (.csv),
and excel files (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt ).
The subset option allows a users to only select a few columns
given a list of column names.
@param filename <str>:
Path of an MAF-like file to read and parse
@param subset list[<str>]:
List of column names which can be used to subset the df
@param skip <str>:
Skips over lines starting with this character
@params kwargs <read_excel(), read_csv(), read_table()>
Keyword arguments forwarded to the underlying pandas reader
@return <pandas dataframe>:
dataframe with spreadsheet contents
"""
# Get file extension
extension = os.path.splitext(filename)[-1].lower()

# Assign a handler to read in the file
if extension in ['.xls', '.xlsx', '.xlsm', '.xlsb', '.odf', '.ods', '.odt']:
# Read in as an Excel file
return excel(filename, subset, skip, **kwargs)
elif extension in ['.csv']:
# Read in as a CSV file
return csv(filename, subset, skip, **kwargs)
else:
# Default to reading in as a TSV file.
# Tab is the normal delimiter for MAF or VCF files.
# MAF files usually have one of the following
# extensions: '.tsv', '.txt', '.text', '.vcf', '.maf'
return tsv(filename, subset, skip, **kwargs)
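
# Illustrative dispatch examples (these filenames are hypothetical,
# shown only to sketch which handler each extension routes to):
#   read('variants.maf')                    # -> tsv(), tab-delimited
#   read('samples.csv', subset=['a', 'b'])  # -> csv(), keeps only columns a and b
#   read('metadata.xlsx')                   # -> excel()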


def excel(filename, subset=[], skip='#', **kwargs):
"""Reads in an excel file as a dataframe. The subset option
allows a users to only select a few columns given a list of
column names.
@param filename <str>:
Path of an EXCEL file to read and parse
@param subset list[<str>]:
List of column names which can be used to subset the df
@param skip <str>:
Skips over line starting with this character
@params kwargs <read_excel()>
Key words to modify pandas.read_excel() function behavior
@return <pandas dataframe>:
dataframe with spreadsheet contents
"""
if subset:
return pd.read_excel(filename, comment=skip, **kwargs)[subset]

return pd.read_excel(filename, comment=skip, **kwargs)


def tsv(filename, subset=[], skip='#', **kwargs):
"""Reads in an TSV file as a dataframe. The subset option
allows a users to only select a few columns given a list of
column names.
@param filename <str>:
Path of an TSV file to read and parse
@param subset list[<str>]:
List of column names which can be used to subset the df
@param skip <str>:
Skips over line starting with this character
@params kwargs <read_excel()>
Key words to modify pandas.read_excel() function behavior
@return <pandas dataframe>:
dataframe with spreadsheet contents
"""
if subset:
return pd.read_table(filename, comment=skip, **kwargs)[subset]

return pd.read_table(filename, comment=skip, **kwargs)


def csv(filename, subset=[], skip='#', **kwargs):
"""Reads in an CSV file as a dataframe. The subset option
allows a users to only select a few columns given a list of
column names.
@param filename <str>:
Path of an CSV file to read and parse
@param subset list[<str>]:
List of column names which can be used to subset the df
@param skip <str>:
Skips over line starting with this character
@params kwargs <read_excel()>
Key words to modify pandas.read_excel() function behavior
@return <pandas dataframe>:
dataframe with spreadsheet contents
"""
if subset:
return pd.read_csv(filename, comment=skip, **kwargs)[subset]

return pd.read_csv(filename, comment=skip, **kwargs)
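
# Note on subsetting: pandas column selection with a list both filters
# and re-orders, so for the file.tsv shown in the usage text above,
#   pd.read_table('file.tsv')[['b', 'a']]
# returns only columns b and a, in that order. This single behavior is
# what lets the same code path both cut and re-order columns by name.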

def parse_arguments():
"""Returns an argparse object of parsed command-line args."""
# Parse command-line arguments
# Create a top-level parser
parser = argparse.ArgumentParser(
usage = argparse.SUPPRESS,
formatter_class=argparse.RawDescriptionHelpFormatter,
description = _help,
add_help=False
)

# Required Arguments
# List of column names to extract
parser.add_argument(
'-c', '--column',
required=True,
nargs = '+',
help = argparse.SUPPRESS
)
# Input file name
parser.add_argument(
'-i', '--input',
required=True,
type=str,
help = argparse.SUPPRESS
)

# Options
# Add custom help message
parser.add_argument(
'-h', '--help',
action='help',
help=argparse.SUPPRESS
)

# Collect parsed arguments
args = parser.parse_args()

return args


def main():
"""Collect command line args and build the matrix."""
# Parse command-line arguments
args = parse_arguments()

# Sanity check for usage
if len(sys.argv) == 1:
# Nothing was provided
fatal('Invalid usage: cut2tsv.py [-h] ...')

# Read in first line of
# input file to get its
# column names
input_header = read(
args.input,
nrows=1
).columns.tolist()
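# For the example file.tsv in the usage text above, this yields
# input_header == ['a', 'b', 'c']; nrows=1 keeps the probe cheap
# because only the header and the first data row are parsed.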

# Check if user columns do not
# exist in the input file
extract_columns = args.column
header_set = set(input_header)
extract_set = set(extract_columns)
missing = extract_set - header_set
if missing:
# Display warning to stderr
# and continue with processing
err(
"Warning: The following columns do not " +
"exist in your input file: '{}'. ".format(missing) +
"Skipping over these columns..."
)
# Remove missing columns
extract_columns = [c for c in extract_columns if c not in missing]

# Check that there are still columns to
# extract or re-order (i.e. non-empty list)
if not extract_columns:
fatal(
"Error: None of the provided column names " +
"exist in your input file!"
)

# Subset and/or re-order columns
df = read(
args.input,
subset = extract_columns
)

# Write the resulting df to stdout
df.to_csv(
sys.stdout,
sep = "\t",
header = True,
index = False
)


if __name__ == '__main__':
# Run main and gracefully handle broken
# pipes (e.g. when piping output into head)
try:
main()
except BrokenPipeError:
pass
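
# Example invocation (a sketch using the file.tsv from the usage text;
# the downstream awk command is illustrative, not part of this script):
#   ./cut2tsv.py -c b a -i file.tsv | awk -F '\t' '{print $1}'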
