initial commit
This commit is contained in:
commit
e85fcd00f3
|
|
@ -0,0 +1,2 @@
|
||||||
|
__pycache__/
|
||||||
|
venv/
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# bodged-together script to run functions to process the dataset
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import jsonlines
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from resolve_coref import resolve_coref
|
||||||
|
from descer import descer
|
||||||
|
|
||||||
|
# parameters
raw_file_dir = "../tunghai_info.raw"  # raw unprocessed dataset
# Converters to run on every document. Kept as a list (not a set) so that the
# order in which they are applied is well defined once more are added.
convert_functions = [descer]
result_file_dir = "../tunghai_info"  # where to put processed dataset
verbosity = True

# prepare result dir (delete the old result dir if it exists)
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# The dataset dir must contain a labels.jsonl file. Each record is read for
# its 'file_name' (the raw document to process) and its 'doc_label' (passed
# to every converter as the second argument).
with jsonlines.open(raw_file_dir + '/labels.jsonl') as labels:
    for label in labels:
        file_name = label['file_name']

        if verbosity:
            sys.stderr.write("processing " + file_name + '\n')

        # read the raw file; the context manager closes the handle promptly
        with open(raw_file_dir + '/' + file_name, "r") as raw_file:
            file_text = raw_file.read()

        # process file in memory, feeding each converter the previous output
        for func in convert_functions:
            file_text = func(file_text, label['doc_label'])

        # write result; leaving the `with` block flushes and closes the file
        with open(result_file_dir + '/' + file_name, "w") as targ_file:
            targ_file.write(file_text)

        if verbosity:
            sys.stderr.write("done processing " + file_name + '\n')
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def descer(doc_text: str, doc_label: str):
    """Wrap doc_text between a header line and a footer line naming doc_label.

    Returns the complete text: the header, a blank line, the document itself,
    a blank line, and the footer terminated by a newline.
    """
    pieces = (
        "The following text is about ",
        doc_label,
        "\n\n",
        doc_text,
        "\n\nThis is the end of ",
        doc_label,
        " text\n",
    )
    return "".join(pieces)
|
||||||
|
|
||||||
|
# When executed as a shell command: the document is read from stdin, the
# label is the command-line arguments joined by spaces, and the labelled
# text is printed to stdout.
if __name__ == "__main__":
    stdin_text = sys.stdin.read()
    cli_label = ' '.join(sys.argv[1:])
    print(descer(stdin_text, cli_label))
|
||||||
|
|
@ -0,0 +1,30 @@
|
||||||
|
attrs==23.1.0
|
||||||
|
blis==0.2.4
|
||||||
|
boto3==1.28.17
|
||||||
|
botocore==1.31.17
|
||||||
|
certifi==2023.7.22
|
||||||
|
charset-normalizer==3.2.0
|
||||||
|
cymem==2.0.7
|
||||||
|
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#sha256=4db16860a8cdef56d436038ace6abeb9181a5176bdc8c16c755a20dced51e5f1
|
||||||
|
idna==3.4
|
||||||
|
importlib-metadata==6.7.0
|
||||||
|
jmespath==1.0.1
|
||||||
|
jsonlines==3.1.0
|
||||||
|
jsonschema==2.6.0
|
||||||
|
murmurhash==1.0.9
|
||||||
|
neuralcoref==4.0
|
||||||
|
numpy==1.21.6
|
||||||
|
plac==0.9.6
|
||||||
|
preshed==2.0.1
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
requests==2.31.0
|
||||||
|
s3transfer==0.6.1
|
||||||
|
six==1.16.0
|
||||||
|
spacy==2.1.0
|
||||||
|
srsly==1.0.7
|
||||||
|
thinc==7.0.8
|
||||||
|
tqdm==4.65.0
|
||||||
|
typing_extensions==4.7.1
|
||||||
|
urllib3==1.26.16
|
||||||
|
wasabi==0.10.1
|
||||||
|
zipp==3.15.0
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import neuralcoref
|
||||||
|
import spacy
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def resolve_coref(doc_text, doc_label=""):
    '''Return doc_text with coreferences resolved by neuralcoref.

    doc_text is run through the spaCy 'en_core_web_sm' pipeline with
    neuralcoref attached, and the document's `coref_resolved` extension
    value is returned.

    doc_label is accepted but not used by this function.

    The pipeline is built on the first call only and cached on the function
    object, so repeated calls do not reload the model each time.
    '''
    nlp = getattr(resolve_coref, "_nlp", None)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        neuralcoref.add_to_pipe(nlp)
        # cache only after neuralcoref is attached, so a failure above never
        # leaves a half-configured pipeline behind for later calls
        resolve_coref._nlp = nlp
    return nlp(doc_text)._.coref_resolved
|
||||||
|
|
||||||
|
# When executed as a shell command: read the document from stdin and print
# the coreference-resolved text to stdout.
if __name__ == "__main__":
    resolved_text = resolve_coref(sys.stdin.read())
    print(resolved_text)
|
||||||
Loading…
Reference in New Issue