#!/usr/bin/env python
# Bodged-together script that runs conversion functions over the dataset.
import os
import shutil
import sys

import jsonlines

from resolve_coref import resolve_coref  # available, but not applied by default below
from descer import descer

# parameters
raw_file_dir = "../tunghai_info.raw"   # raw, unprocessed dataset
convert_functions = (descer,)          # conversion functions to apply, in order
result_file_dir = "../tunghai_info"    # where to put the processed dataset
verbosity = True

# prepare the result dir (delete the old result dir if it exists)
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# the dataset dir should contain a labels.jsonl file describing each document
with jsonlines.open(os.path.join(raw_file_dir, "labels.jsonl")) as labels:
    for label in labels:
        if verbosity:
            sys.stderr.write("processing " + label["file_name"] + "\n")

        # read the raw file into memory
        with open(os.path.join(raw_file_dir, label["file_name"]), "r") as raw_file:
            file_text = raw_file.read()

        # apply each conversion function in turn
        for func in convert_functions:
            file_text = func(file_text, label["doc_label"])

        # write the processed result
        with open(os.path.join(result_file_dir, label["file_name"]), "w") as targ_file:
            targ_file.write(file_text)

        if verbosity:
            sys.stderr.write("done processing " + label["file_name"] + "\n")