# Source code for the ``clean`` example script.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
from functools import partial
from itertools import chain
from operator import itemgetter
import os

from karld import is_py3


# Compatibility alias: when karld reports a Python 3 interpreter, bind the
# name ``unicode`` to ``str`` so the code below can call ``unicode(...)``
# without a NameError (Python 3 has no ``unicode`` builtin).
if is_py3():
    unicode = str

from karld.loadump import is_file_csv
from karld.run_together import csv_file_to_file
from karld.run_together import pool_run_files_to_files
from karld.run_together import serial_run_files_to_files


def contrived_cleaner(data_items):
    """
    Title-case, sort and index rows of data.

    Each input row is reduced to its first two fields, both converted to
    title case, and prefixed with the row's original position (as a
    unicode string).  The rows are then sorted by the title-cased second
    field and prefixed again with their position in the sorted order
    (as an int).

    :param data_items: An iterable of rows; each row is an indexable
        sequence whose first two items are unicode strings.
    :returns: A tuple of
        ``(sorted_index, original_index, first_field, second_field)``
        tuples, ordered by ``second_field``.
    :raises IndexError: If a list/tuple row has fewer than two fields.
    """
    # Position of the second data field once the original index has been
    # prepended at position 0.
    sort_column = 2

    with_original_index = (
        (unicode(original_index), row[0].title(), row[1].title())
        for original_index, row in enumerate(data_items)
    )

    # sorted() is stable, so rows with equal sort keys keep their input
    # order relative to each other.
    ordered_rows = sorted(with_original_index, key=itemgetter(sort_column))

    return tuple(
        (sorted_index,) + row
        for sorted_index, row in enumerate(ordered_rows)
    )
def run(in_dir, out_dir, pool):
    """
    Run ``contrived_cleaner`` over the CSV files found in ``in_dir``.

    :param in_dir: Directory handed to the files-to-files runner as the
        input location; files are filtered with ``is_file_csv``.
    :param out_dir: Output directory, bound as an argument of
        ``csv_file_to_file``.
    :param pool: When truthy, ``pool_run_files_to_files`` is used (and
        "multi-processing" is printed); otherwise the files are handled
        by ``serial_run_files_to_files``.
    """
    if pool:
        print("multi-processing")
        files_to_files_runner = pool_run_files_to_files
    else:
        files_to_files_runner = serial_run_files_to_files

    # NOTE(review): the empty string is the second bound argument of
    # csv_file_to_file -- presumably an output file-name prefix; confirm
    # against karld.run_together.
    clean_csv_file = partial(csv_file_to_file, contrived_cleaner, "", out_dir)

    files_to_files_runner(clean_csv_file, in_dir, filter_func=is_file_csv)
def _parse_pool_flag(value):
    """
    Interpret the ``--pool`` command-line value as a boolean.

    An empty value or one of the usual "off" spellings (``0``, ``false``,
    ``f``, ``no``, ``n``, ``off``; case-insensitive) is false.  Every
    other value, e.g. ``True``, is true.

    :param value: The raw string given on the command line.
    :returns: ``True`` if multi-processing was requested, else ``False``.
    """
    return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


def main(*args):
    """
    Try it::

        python clean.py

    or::

        python clean.py --pool True

    or::

        python clean.py --in-dir split_data_ml/data --out-dir my_clean_data

    or::

        python clean.py --pool True --in-dir split_data_ml/data

    :param args: Positional arguments forwarded unchanged to
        ``argparse.ArgumentParser``.  The command line itself is always
        read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(*args)
    parser.add_argument("--in-dir",
                        default=os.path.join("split_data_ml", "data"),
                        help="Data source directory")
    parser.add_argument("--out-dir",
                        default="clean_data",
                        help="Data output directory")
    # Without a ``type`` the value stays a string, so ``--pool False``
    # would be truthy and switch multi-processing on.  The non-string
    # default is not passed through ``type`` by argparse.
    parser.add_argument("--pool",
                        default=False,
                        type=_parse_pool_flag,
                        help="Use multi-processing (e.g. --pool True)")

    parsed = parser.parse_args()

    run(parsed.in_dir, parsed.out_dir, parsed.pool)
# Start the command-line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()