initial commit

This commit is contained in:
Ian Griffin 2023-08-06 11:43:43 +07:00
commit e85fcd00f3
5 changed files with 102 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
__pycache__/
venv/

44
data_prepare.py Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env python
# bodged-together script that runs the functions which process the dataset
import argparse
import jsonlines
import os
import shutil
import sys
from resolve_coref import resolve_coref
from descer import descer
# parameters
raw_file_dir = "../tunghai_info.raw"  # raw unprocessed dataset
# Functions to run on every document, applied in the listed order. A list
# (rather than a set) keeps that order deterministic once more than one
# function is enabled. Each function is called as func(text, doc_label)
# and must return the new text.
convert_functions = [descer]
result_file_dir = "../tunghai_info"  # where to put processed dataset
verbosity = True

# prepare result dir (delete the old result dir if it exists)
if os.path.isdir(result_file_dir):
    shutil.rmtree(result_file_dir)
os.makedirs(result_file_dir)

# dataset dir should have a labels.jsonl file to get the description;
# each record is read for its 'file_name' and 'doc_label' keys
with jsonlines.open(raw_file_dir + '/labels.jsonl') as labels:
    for label in labels:
        file_name = label['file_name']
        if verbosity:
            sys.stderr.write("processing " + file_name + '\n')

        # read the whole raw file; the with-block closes the handle
        with open(raw_file_dir + '/' + file_name, "r") as raw_file:
            file_text = raw_file.read()

        # process file in memory
        for func in convert_functions:
            file_text = func(file_text, label['doc_label'])

        # write result; leaving the with-block flushes and closes the file,
        # so the output is complete on disk before "done" is reported
        with open(result_file_dir + '/' + file_name, "w") as targ_file:
            targ_file.write(file_text)

        if verbosity:
            sys.stderr.write("done processing " + file_name + '\n')

12
descer.py Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env python
import sys
def descer(doc_text: str, doc_label: str):
    '''Wrap doc_text between an opening and a closing line that name doc_label.

    The returned text is: a header line, a blank line, doc_text, a blank
    line, and a footer line terminated by a newline.
    '''
    header = "The following text is about " + doc_label
    footer = "This is the end of " + doc_label + " text\n"
    # a blank line separates the header, the body and the footer
    return "\n\n".join((header, doc_text, footer))
# Command-line use: the document text comes from stdin, the label is the
# command-line arguments joined by single spaces, and the result is printed
# to stdout.
if __name__ == "__main__":
    stdin_text = sys.stdin.read()
    cli_label = ' '.join(sys.argv[1:])
    print(descer(stdin_text, cli_label))

30
requirements.txt Normal file
View File

@ -0,0 +1,30 @@
attrs==23.1.0
blis==0.2.4
boto3==1.28.17
botocore==1.31.17
certifi==2023.7.22
charset-normalizer==3.2.0
cymem==2.0.7
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#sha256=4db16860a8cdef56d436038ace6abeb9181a5176bdc8c16c755a20dced51e5f1
idna==3.4
importlib-metadata==6.7.0
jmespath==1.0.1
jsonlines==3.1.0
jsonschema==2.6.0
murmurhash==1.0.9
neuralcoref==4.0
numpy==1.21.6
plac==0.9.6
preshed==2.0.1
python-dateutil==2.8.2
requests==2.31.0
s3transfer==0.6.1
six==1.16.0
spacy==2.1.0
srsly==1.0.7
thinc==7.0.8
tqdm==4.65.0
typing_extensions==4.7.1
urllib3==1.26.16
wasabi==0.10.1
zipp==3.15.0

14
resolve_coref.py Executable file
View File

@ -0,0 +1,14 @@
#!/usr/bin/env python
import neuralcoref
import spacy
import sys
# Lazily-built spaCy pipeline with neuralcoref attached; shared by all calls
# so the model is loaded only once per process.
_coref_pipeline = None


def _get_coref_pipeline():
    '''Return the shared pipeline, building it on first use.'''
    global _coref_pipeline
    if _coref_pipeline is None:
        nlp = spacy.load('en_core_web_sm')
        neuralcoref.add_to_pipe(nlp)
        # publish only after the pipeline is fully set up
        _coref_pipeline = nlp
    return _coref_pipeline


def resolve_coref(doc_text, doc_label=""):
    '''Return doc_text with coreferences resolved using neuralcoref.

    doc_text is run through the 'en_core_web_sm' spaCy pipeline with
    neuralcoref added, and the document's `coref_resolved` text is returned.

    doc_label is not used by this function; it is accepted so the function
    can be called with the same (text, label) arguments as descer.
    '''
    return _get_coref_pipeline()(doc_text)._.coref_resolved
# Command-line use: read the document from stdin and print the
# coreference-resolved text to stdout.
if __name__ == "__main__":
    stdin_text = sys.stdin.read()
    print(resolve_coref(stdin_text))