commit e85fcd00f38bad324f4dcdb4524fbb6f2d84b1c0 Author: Ian Griffin Date: Sun Aug 6 11:43:43 2023 +0700 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..92afa22 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +venv/ diff --git a/data_prepare.py b/data_prepare.py new file mode 100755 index 0000000..98dd97b --- /dev/null +++ b/data_prepare.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +# bogded-up together script to run functions to process the dataset + +import argparse +import jsonlines +import os +import shutil +import sys +from resolve_coref import resolve_coref +from descer import descer + +# parameters +raw_file_dir = "../tunghai_info.raw" # raw unprocessed dataset +convert_functions = {descer} # functions to run +result_file_dir = "../tunghai_info" # where to put processed dataset +verbosity = True + +# prepare result dir (delete the old result dir if exist) +if os.path.isdir(result_file_dir): + shutil.rmtree(result_file_dir) +os.makedirs(result_file_dir) + +# dataset dir should have labels.jsonl file to get the description +labels = jsonlines.open(raw_file_dir + '/labels.jsonl') +for label in labels: + if verbosity == True: + sys.stderr.write("processing " + label['file_name'] + '\n') + + + # open raw file + raw_file = open(raw_file_dir + '/' + label['file_name'], "r") + file_text = ''.join(raw_file.readlines()) + + # process file in memory + for func in convert_functions: + file_text = func(file_text, label['doc_label']) + + # write result + targ_file = open(result_file_dir + '/' + label['file_name'], "w") + targ_file.write(file_text) + + if verbosity == True: + sys.stderr.write("done processing " + label['file_name'] + '\n') \ No newline at end of file diff --git a/descer.py b/descer.py new file mode 100755 index 0000000..413292f --- /dev/null +++ b/descer.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +import sys + + +def descer(doc_text: str, doc_label: str): + ''' adds descriptive labels on top and bottom of doc_text, and returns the resulting complete text with the aforementioned descriptive labels''' + return "The following text is about " + doc_label + "\n\n" + doc_text + "\n\nThis is the end of "+ doc_label + " text\n" + +# if run as shell command, takes stdin as input, stdout as output +if __name__ == "__main__": + print(descer(sys.stdin.read(), ' '.join(sys.argv[1:]))) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c923052 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,30 @@ +attrs==23.1.0 +blis==0.2.4 +boto3==1.28.17 +botocore==1.31.17 +certifi==2023.7.22 +charset-normalizer==3.2.0 +cymem==2.0.7 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#sha256=4db16860a8cdef56d436038ace6abeb9181a5176bdc8c16c755a20dced51e5f1 +idna==3.4 +importlib-metadata==6.7.0 +jmespath==1.0.1 +jsonlines==3.1.0 +jsonschema==2.6.0 +murmurhash==1.0.9 +neuralcoref==4.0 +numpy==1.21.6 +plac==0.9.6 +preshed==2.0.1 +python-dateutil==2.8.2 +requests==2.31.0 +s3transfer==0.6.1 +six==1.16.0 +spacy==2.1.0 +srsly==1.0.7 +thinc==7.0.8 +tqdm==4.65.0 +typing_extensions==4.7.1 +urllib3==1.26.16 +wasabi==0.10.1 +zipp==3.15.0 diff --git a/resolve_coref.py b/resolve_coref.py new file mode 100755 index 0000000..150e67d --- /dev/null +++ b/resolve_coref.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python + +import neuralcoref +import spacy +import sys + +def resolve_coref(doc_text, doc_label=""): + '''removes coreferences in doc_text using neuralcoref''' + nlp = spacy.load('en_core_web_sm') + neuralcoref.add_to_pipe(nlp) + return nlp(doc_text)._.coref_resolved + +if __name__ == "__main__": + print(resolve_coref(sys.stdin.read())) \ No newline at end of file