44 lines
1.3 KiB
Python
Executable File
44 lines
1.3 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# bodged-together script that runs functions to process the dataset
|
|
|
|
import argparse
|
|
import jsonlines
|
|
import os
|
|
import shutil
|
|
import sys
|
|
from resolve_coref import resolve_coref
|
|
from descer import descer
|
|
|
|
# parameters
raw_file_dir = "../tunghai_info.raw"  # raw unprocessed dataset
# Functions to run on every file.  Each one is called as
# func(file_text, doc_label) and its return value replaces the text.
# A list (not a set) is used so that, when more than one function is
# listed, they are applied in exactly the order written here.
convert_functions = [descer]
result_file_dir = "../tunghai_info"  # where to put processed dataset
verbosity = True  # when true, progress messages are written to stderr
|
|
|
|
# Prepare the result dir: delete the old one if it exists so no output from
# a previous run is left behind, then recreate it empty.
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# The dataset dir must contain a labels.jsonl file.  Each record is read for
# its 'file_name' (the raw file to process) and its 'doc_label' (passed to
# every conversion function).
# Context managers guarantee the reader and each per-file handle are closed,
# and the output flushed, even if a conversion function raises.
with jsonlines.open(raw_file_dir + '/labels.jsonl') as labels:
    for label in labels:
        file_name = label['file_name']
        if verbosity:
            sys.stderr.write("processing " + file_name + '\n')

        # read the whole raw file into memory
        with open(raw_file_dir + '/' + file_name, "r") as raw_file:
            file_text = raw_file.read()

        # process the file in memory: each function receives the current
        # text plus the document label, and returns the text for the next
        for func in convert_functions:
            file_text = func(file_text, label['doc_label'])

        # write the result under the same file name in the result dir
        with open(result_file_dir + '/' + file_name, "w") as targ_file:
            targ_file.write(file_text)

        if verbosity:
            sys.stderr.write("done processing " + file_name + '\n')