dataset_preparer/data_prepare.py

44 lines
1.3 KiB
Python
Executable File

#!/usr/bin/env python
"""Bodged-together script to process the dataset.

Reads every entry in <raw_file_dir>/labels.jsonl, loads the named raw file,
pipes its text through each function in `convert_functions` (in order), and
writes the result under the same file name into `result_file_dir`.
"""
import argparse
import jsonlines
import os
import shutil
import sys

from resolve_coref import resolve_coref
from descer import descer

# --- parameters ---
raw_file_dir = "../tunghai_info.raw"  # raw unprocessed dataset
# Processing pipeline. Must be an ordered sequence (a list, not a set):
# with several functions the application order matters.
convert_functions = [descer]
result_file_dir = "../tunghai_info"   # where to put processed dataset
verbosity = True

# Prepare the result dir (delete the old result dir if it exists).
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# The dataset dir must contain a labels.jsonl file; each record maps a
# 'file_name' to its 'doc_label' (the description passed to the converters).
with jsonlines.open(raw_file_dir + '/labels.jsonl') as labels:
    for label in labels:
        if verbosity:
            sys.stderr.write("processing " + label['file_name'] + '\n')
        # Read the whole raw file into memory.
        with open(raw_file_dir + '/' + label['file_name'], "r") as raw_file:
            file_text = raw_file.read()
        # Apply each conversion function in sequence.
        for func in convert_functions:
            file_text = func(file_text, label['doc_label'])
        # Write the processed result; the context manager guarantees the
        # handle is flushed and closed (the original leaked every handle).
        with open(result_file_dir + '/' + label['file_name'], "w") as targ_file:
            targ_file.write(file_text)
        if verbosity:
            sys.stderr.write("done processing " + label['file_name'] + '\n')