# Source code for scripts.standard_to_sentences

#!/usr/bin/env python3

import argparse
import os
import sys
from itertools import count
from logging import warning

from ucca.convert import split2sentences, split_passage
from ucca.ioutil import passage2file, get_passages_with_progress_bar, external_write_mode
from ucca.normalization import normalize
from ucca.textutil import extract_terminals

# Description string shown by argparse (see the __main__ block at the bottom of this file).
desc = """Parses XML files in UCCA standard format, and writes a passage per sentence."""

NUM_NODES_WARNING = 500  # Warn if a sentence has more than this many nodes


class Splitter:
    """Splits passages into sentence passages according to a given list of reference sentences.

    Passage terminals are matched against the reference sentences; each matched
    sentence end becomes a split point passed to ``split_passage``.
    """

    def __init__(self, sentences, enum=False, suffix_format="%03d", suffix_start=0):
        """
        :param sentences: list of reference sentence strings to split by
        :param enum: if True, output passage IDs are the matched sentences' global indices
        :param suffix_format: printf-style format for the numeric suffix of output passage IDs
        :param suffix_start: first value used for the numeric ID suffix
        """
        self.sentences = sentences
        # Map each distinct sentence string to the list of its indices in `sentences`,
        # so that a repeated sentence can be matched once per occurrence (indices are
        # popped as they are consumed in split()).
        self.sentence_to_index = {}
        for i, sentence in enumerate(sentences):
            self.sentence_to_index.setdefault(sentence, []).append(i)
        self.enumerate = enum
        self.suffix_format = suffix_format
        self.suffix_start = suffix_start
        # Index of the next sentence expected to match; set to None in split() when
        # the expected next sentence did not match (out of sync with the input).
        self.index = 0
        self.matched_indices = set()  # indices of sentences matched so far, for reporting leftovers

    @classmethod
    def read_file(cls, filename, **kwargs):
        """Create a Splitter from a text file containing one sentence per line.

        :param filename: path to the sentences file, or None to return None (no splitter)
        :param kwargs: forwarded to the constructor (enum, suffix_format, suffix_start)
        :return: Splitter instance, or None if filename is None
        """
        if filename is None:
            return None
        with open(filename, encoding="utf-8") as f:
            sentences = [line.strip() for line in f]
        return cls(sentences, **kwargs)

    def split(self, passage):
        """Split one passage into sentence passages at terminals matching self.sentences.

        Accumulates candidate token prefixes over the passage's terminals; whenever a
        candidate equals a reference sentence, that terminal's position is recorded as
        a sentence end. Updates self.index and self.matched_indices as a side effect.

        :param passage: UCCA passage object to split
        :return: list of sentence passages, as returned by split_passage()
        """
        ends = []  # terminal positions at which matched sentences end
        ids = []  # matched sentence indices (as strings); used as IDs when self.enumerate
        token_lists = []  # candidate token prefixes, one per possible sentence start
        for terminal in extract_terminals(passage):
            token_lists.append([])
            # While in sync (self.index is not None), only the oldest candidate is
            # extended; when out of sync, every candidate start is tried.
            for terminals in token_lists if self.index is None else [token_lists[0]]:
                terminals.append(terminal)
                sentence = " ".join(t.text for t in terminals)
                if self.index is not None and self.index < len(self.sentences) and self.sentences[
                        self.index].startswith(sentence):  # Try matching next sentence rather than shortest
                    # Complete match of the expected sentence, or None while it is only a prefix
                    index = self.index if self.sentences[self.index] == sentence else None
                else:
                    # Fall back to looking the candidate up among all remaining sentences;
                    # resynchronize self.index to the match (or None if no match).
                    indices = self.sentence_to_index.get(sentence)
                    index = self.index = indices.pop(0) if indices else None
                if index is not None:  # found a complete sentence
                    self.matched_indices.add(index)
                    # Also close the previous sentence at the position just before this
                    # one's first terminal, if it was not already recorded as an end.
                    last_end = terminals[0].position - 1
                    if len(terminals) > 1 and last_end and last_end not in ends:
                        ends.append(last_end)
                    ends.append(terminal.position)
                    ids.append(str(index))
                    token_lists = []  # discard all candidates; start fresh after this sentence
                    self.index += 1  # expect the following sentence next
                    break
        return split_passage(passage, ends, ids=ids if self.enumerate else None,
                             suffix_format=self.suffix_format, suffix_start=self.suffix_start)
def main(args):
    """Split each input passage into sentence passages and write one file per sentence.

    Uses a Splitter built from args.sentences when that file is given; otherwise
    falls back to automatic splitting via split2sentences(). Output files go to
    args.outdir as XML or pickle, depending on args.binary.

    :param args: argparse namespace (see the argument definitions in __main__)
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # global sentence counter across all passages
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            # A very large node count suggests the sentence boundary detection failed
            if len(sentence.nodes) > NUM_NODES_WARNING:
                warning(f"Sentence {i} in passage {passage.ID} has {len(sentence.nodes)} > {NUM_NODES_WARNING} nodes")
            if args.verbose:
                with external_write_mode():  # write to stderr without clobbering the progress bar
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report any reference sentences that never matched the input passages
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:",
              *[s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices],
              sep="\n")
if __name__ == "__main__":
    # Command-line entry point: parse arguments and delegate to main().
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-f", "--suffix-format", default="%03d", help="sentence number suffix format")
    argparser.add_argument("-i", "--suffix-start", type=int, default=0, help="start index for number suffix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s", "--sentences", help="optional input file with sentence at each line to split by")
    argparser.add_argument("-e", "--enumerate", action="store_true", help="set each output sentence ID by global order")
    # Normalization is on by default; -N stores False into args.normalize
    argparser.add_argument("-N", "--no-normalize", dest="normalize", action="store_false",
                           help="do not normalize passages after splitting")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print information about every split sentence")
    main(argparser.parse_args())