initial commit
This commit is contained in:
commit
e85fcd00f3
|
|
@ -0,0 +1,2 @@
|
|||
__pycache__/
|
||||
venv/
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# bodged-together script that runs functions to process the dataset
|
||||
|
||||
import argparse
|
||||
import jsonlines
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from resolve_coref import resolve_coref
|
||||
from descer import descer
|
||||
|
||||
# parameters
raw_file_dir = "../tunghai_info.raw" # raw unprocessed dataset
convert_functions = [descer] # functions to run; a list so they are applied in a fixed order
result_file_dir = "../tunghai_info" # where to put processed dataset
verbosity = True

# prepare result dir (delete the old result dir if it exists)
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# dataset dir should have a labels.jsonl file; each record is read for its
# 'file_name' (the raw document to process) and its 'doc_label' (the
# description handed to every conversion function)
with jsonlines.open(raw_file_dir + '/labels.jsonl') as labels:
    for label in labels:
        if verbosity:
            sys.stderr.write("processing " + label['file_name'] + '\n')

        # read the whole raw file; the with-block closes the handle right away
        with open(raw_file_dir + '/' + label['file_name'], "r") as raw_file:
            file_text = raw_file.read()

        # process file in memory: each function receives the text produced by
        # the previous one, together with the document label
        for func in convert_functions:
            file_text = func(file_text, label['doc_label'])

        # write result under the same file name; leaving the with-block
        # flushes and closes the file before the next document is handled
        with open(result_file_dir + '/' + label['file_name'], "w") as targ_file:
            targ_file.write(file_text)

        if verbosity:
            sys.stderr.write("done processing " + label['file_name'] + '\n')
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
def descer(doc_text: str, doc_label: str):
    '''Wrap doc_text between an opening line and a closing line that both name doc_label.

    The returned string is the opening line, a blank line, doc_text, a blank
    line, and the closing line terminated by a newline.
    '''
    header = ("The following text is about ", doc_label, "\n\n")
    footer = ("\n\nThis is the end of ", doc_label, " text\n")
    return "".join((*header, doc_text, *footer))
|
||||
|
||||
# when executed directly: the document text comes from stdin, the label is the
# command-line arguments joined by spaces, and the result goes to stdout
if __name__ == "__main__":
    stdin_text = sys.stdin.read()
    label_from_args = ' '.join(sys.argv[1:])
    print(descer(stdin_text, label_from_args))
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
attrs==23.1.0
|
||||
blis==0.2.4
|
||||
boto3==1.28.17
|
||||
botocore==1.31.17
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.2.0
|
||||
cymem==2.0.7
|
||||
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#sha256=4db16860a8cdef56d436038ace6abeb9181a5176bdc8c16c755a20dced51e5f1
|
||||
idna==3.4
|
||||
importlib-metadata==6.7.0
|
||||
jmespath==1.0.1
|
||||
jsonlines==3.1.0
|
||||
jsonschema==2.6.0
|
||||
murmurhash==1.0.9
|
||||
neuralcoref==4.0
|
||||
numpy==1.21.6
|
||||
plac==0.9.6
|
||||
preshed==2.0.1
|
||||
python-dateutil==2.8.2
|
||||
requests==2.31.0
|
||||
s3transfer==0.6.1
|
||||
six==1.16.0
|
||||
spacy==2.1.0
|
||||
srsly==1.0.7
|
||||
thinc==7.0.8
|
||||
tqdm==4.65.0
|
||||
typing_extensions==4.7.1
|
||||
urllib3==1.26.16
|
||||
wasabi==0.10.1
|
||||
zipp==3.15.0
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import neuralcoref
|
||||
import spacy
|
||||
import sys
|
||||
|
||||
_coref_pipeline = None  # spaCy pipeline with neuralcoref attached; built on first use


def resolve_coref(doc_text, doc_label=""):
    '''Return doc_text with its coreferences resolved by neuralcoref.

    doc_text is run through the 'en_core_web_sm' spaCy pipeline with
    neuralcoref added, and the document's `coref_resolved` text is returned.

    doc_label is accepted but not used here; presumably it exists so this
    function can be called with the same (text, label) arguments as other
    conversion functions -- confirm against the caller.

    The pipeline is created once and reused, so calling this function for
    many documents does not reload the model each time.
    '''
    global _coref_pipeline
    if _coref_pipeline is None:
        nlp = spacy.load('en_core_web_sm')
        neuralcoref.add_to_pipe(nlp)
        # only publish the pipeline once it is fully set up
        _coref_pipeline = nlp
    return _coref_pipeline(doc_text)._.coref_resolved
|
||||
|
||||
# when executed directly: read the text from stdin and print the resolved text
if __name__ == "__main__":
    stdin_text = sys.stdin.read()
    print(resolve_coref(stdin_text))
|
||||
Loading…
Reference in New Issue