Python tokenization.convert_to_unicode() Examples

The following are 30 code examples of tokenization.convert_to_unicode(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tokenization, or try the search function.
Example #1
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Build `InputExample`s from TSV rows for one dataset split.

    Row 0 is a header and is skipped. Columns 3 and 4 hold the sentence
    pair and column 0 the gold label; test rows get the dummy label "0".
    """
    examples = []
    for idx, row in enumerate(lines):
        if idx == 0:
            continue  # skip the header row
        sentence_a = tokenization.convert_to_unicode(row[3])
        sentence_b = tokenization.convert_to_unicode(row[4])
        # Test data carries no gold label; use a placeholder instead.
        label = "0" if set_type == "test" else tokenization.convert_to_unicode(row[0])
        examples.append(InputExample(
            guid="%s-%s" % (set_type, idx),
            text_a=sentence_a,
            text_b=sentence_b,
            label=label))
    return examples
Example #2
Source File: sentence_embedding.py    From SOQAL with MIT License 6 votes vote down vote up
def read_examples(text):
  """Build a single-element list of `InputExample`s from a raw string.

  A string of the form "text_a ||| text_b" becomes a sentence pair;
  any other string becomes a single-sentence example with id 0.
  """
  # Collapse line breaks so the whole input forms one logical line.
  flat = text.replace('\n', ' ')
  line = tokenization.convert_to_unicode(flat)
  line = line.strip()
  match = re.match(r"^(.*) \|\|\| (.*)$", line)
  if match:
    first, second = match.group(1), match.group(2)
  else:
    first, second = line, None
  return [InputExample(unique_id=0, text_a=first, text_b=second)]
Example #3
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test TSV ships with a header row. Test rows keep their
    sentence in column 1 and get the dummy label "0"; train/dev rows
    keep the sentence in column 3 and the label in column 1.
    """
    is_test = set_type == "test"
    examples = []
    for row_num, row in enumerate(lines):
        if is_test and row_num == 0:
            continue  # drop the test-set header
        if is_test:
            text = tokenization.convert_to_unicode(row[1])
            label = "0"
        else:
            text = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(InputExample(guid="%s-%s" % (set_type, row_num),
                                     text_a=text, text_b=None, label=label))
    return examples
Example #4
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Turn raw TSV rows into `InputExample`s, dropping the header row.

    Columns 3/4 carry the sentence pair and column 0 the gold label;
    test rows receive the placeholder label "0".
    """
    to_unicode = tokenization.convert_to_unicode
    examples = []
    for row_idx, row in enumerate(lines):
        if row_idx == 0:
            continue  # first row is the column header
        label = "0" if set_type == "test" else to_unicode(row[0])
        examples.append(
            InputExample(guid="%s-%s" % (set_type, row_idx),
                         text_a=to_unicode(row[3]),
                         text_b=to_unicode(row[4]),
                         label=label))
    return examples
Example #5
Source File: run_classifier.py    From text_bert_cnn with MIT License 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Create `InputExample`s for one split (train, dev, or test).

    Note: only the test TSV carries a header line. Test sentences live
    in column 1 (dummy label "0"); train/dev sentences live in column 3
    with their label in column 1.
    """
    examples = []
    for i, row in enumerate(lines):
        if set_type == "test" and i == 0:
            continue  # test-set header line
        if set_type == "test":
            sentence, label = tokenization.convert_to_unicode(row[1]), "0"
        else:
            sentence = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid="%s-%s" % (set_type, i),
                         text_a=sentence, text_b=None, label=label))
    return examples
Example #6
Source File: extract_features.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def read_examples(input_file):
  """Read `InputExample`s from `input_file`, one example per line.

  A line of the form "text_a ||| text_b" yields a sentence pair;
  any other line yields a single-sentence example.
  """
  pair_re = re.compile(r"^(.*) \|\|\| (.*)$")
  examples = []
  uid = 0
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      raw = tokenization.convert_to_unicode(reader.readline())
      if not raw:
        break  # EOF
      stripped = raw.strip()
      match = pair_re.match(stripped)
      if match:
        first, second = match.group(1), match.group(2)
      else:
        first, second = stripped, None
      examples.append(InputExample(unique_id=uid, text_a=first, text_b=second))
      uid += 1
  return examples
Example #7
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """Read the four training folds and return shuffled `InputExample`s.

    Each line is a JSON record with a "doc" (id/title) and a
    "term_recall" dict; records with an empty title or an empty
    term-recall dict are skipped.
    """
    examples = []
    train_files = ["train.fold0.docterm_recall", "train.fold1.docterm_recall",
                   "train.fold2.docterm_recall", "train.fold3.docterm_recall"]

    for file_name in train_files:
        # `with` guarantees the file is closed even if json parsing raises
        # (the original leaked the handle on exception).
        with open(os.path.join(data_dir, file_name)) as train_file:
            for line in train_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]
                if not term_recall_dict or not doc_text.strip():
                    continue  # nothing to train on for this doc

                examples.append(
                    InputExample(guid="train-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    random.shuffle(examples)
    return examples
Example #8
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_dev_examples(self, data_dir):
    """Read the held-out half-fold and return its `InputExample`s.

    Each line is a JSON record with a "doc" (id/title) and a
    "term_recall" weight dict.
    """
    dev_files = ["train.fold4.docterm_recall.firsthalf"]
    examples = []

    for file_name in dev_files:
        # Use a context manager so the file is closed even if a line
        # fails to parse (the original leaked the handle on exception).
        with open(os.path.join(data_dir, file_name)) as dev_file:
            for line in dev_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]

                examples.append(
                    InputExample(guid="dev-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #9
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_test_examples(self, data_dir):
    """Read the test half-fold and return its `InputExample`s.

    Each line is a JSON record with a "doc" (id/title) and a
    "term_recall" weight dict.
    """
    test_files = ["train.fold4.docterm_recall.secondhalf"]
    examples = []

    for file_name in test_files:
        # Context manager closes the file even if parsing raises
        # (the original leaked the handle on exception).
        with open(os.path.join(data_dir, file_name)) as test_file:
            for line in test_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]

                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #10
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """Read training `InputExample`s from a single file path.

    Here `data_dir` is the path to one JSON-lines file: each line has a
    "doc" (id/title) and a "term_recall" weight dict. Examples are
    shuffled before being returned.
    """
    examples = []
    train_files = [data_dir]

    for file_name in train_files:
        # `with` closes the file even when json.loads raises
        # (the original leaked the handle on exception).
        with open(file_name) as train_file:
            for line in train_file:
                json_dict = json.loads(line)
                docid = json_dict["doc"]["id"]
                doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
                term_recall_dict = json_dict["term_recall"]

                examples.append(
                    InputExample(guid="train-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    random.shuffle(examples)
    return examples
Example #11
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_test_examples(self, data_dir):
    """Read TREC-19 test docs from `data_dir` (a single file path).

    Each line is a JSON record; the text field is selected by
    FLAGS.doc_field and truncated/cleaned to FLAGS.max_body_length.
    Empty documents are kept with a placeholder body so every docid
    still yields an example.
    """
    test_files = [data_dir]
    examples = []

    for file_name in test_files:
        # Context manager closes the file even if parsing raises
        # (the original leaked the handle on exception).
        with open(file_name) as test_file:
            for line in test_file:
                jdict = json.loads(line)
                docid = jdict["id"]
                doc_text = tokenization.convert_to_unicode(jdict[FLAGS.doc_field])
                doc_text = truncate_and_clean_trec_19_doc(doc_text, FLAGS.max_body_length)
                term_recall_dict = {}
                # Deliberate: keep empty docs with a dummy body rather
                # than skipping them, so run output covers all docids.
                if not doc_text.strip():
                    doc_text = '.'

                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #12
Source File: run_deepct.py    From DeepCT with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_test_examples(self, data_dir):
    """Read test docs from `data_dir` (a single JSON-lines file path).

    Each line is a JSON record with an "id" and a "content" field.
    Empty documents are kept with a placeholder body so every docid
    still yields an example.
    """
    test_files = [data_dir]
    examples = []

    for file_name in test_files:
        # Context manager closes the file even if parsing raises
        # (the original leaked the handle on exception).
        with open(file_name) as test_file:
            for line in test_file:
                jdict = json.loads(line)
                docid = jdict["id"]
                doc_text = tokenization.convert_to_unicode(jdict["content"])
                term_recall_dict = {}
                if not doc_text.strip():
                    doc_text = '.'  # dummy body for empty docs

                examples.append(
                    InputExample(guid="test-%s" % docid, text=doc_text,
                                 term_recall_dict=term_recall_dict))
    return examples
Example #13
Source File: run_classifier.py    From text_bert_cnn with MIT License 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Columns 8/9 hold premise/hypothesis, column 0 a pair id used in the
    guid, and the last column the gold label. Test rows receive the
    placeholder label "contradiction".
    """
    cvt = tokenization.convert_to_unicode
    examples = []
    for i, row in enumerate(lines):
        if i == 0:
            continue  # header row
        examples.append(InputExample(
            guid="%s-%s" % (set_type, cvt(row[0])),
            text_a=cvt(row[8]),
            text_b=cvt(row[9]),
            label="contradiction" if set_type == "test" else cvt(row[-1])))
    return examples
Example #14
Source File: extract_features.py    From models with Apache License 2.0 6 votes vote down vote up
def read_examples(input_file):
  """Build a list of `InputExample`s from the lines of `input_file`.

  Lines shaped "A ||| B" become sentence pairs; everything else is a
  single sentence. Unique ids are assigned sequentially from zero.
  """
  examples = []
  next_id = 0
  with tf.io.gfile.GFile(input_file, "r") as reader:
    line = tokenization.convert_to_unicode(reader.readline())
    while line:
      text = line.strip()
      m = re.match(r"^(.*) \|\|\| (.*)$", text)
      if m is None:
        sent_a, sent_b = text, None
      else:
        sent_a, sent_b = m.group(1), m.group(2)
      examples.append(
          InputExample(unique_id=next_id, text_a=sent_a, text_b=sent_b))
      next_id += 1
      line = tokenization.convert_to_unicode(reader.readline())
  return examples
Example #15
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Only the test TSV has a header line; its sentences sit in column 1
    and get the dummy label "0", while train/dev sentences sit in
    column 3 with their label in column 1.
    """
    examples = []
    for i, row in enumerate(lines):
        if i == 0 and set_type == "test":
            continue  # test-set header
        src_col = 1 if set_type == "test" else 3
        sentence = tokenization.convert_to_unicode(row[src_col])
        label = "0" if set_type == "test" else tokenization.convert_to_unicode(row[1])
        examples.append(InputExample(guid="%s-%s" % (set_type, i),
                                     text_a=sentence, text_b=None, label=label))
    return examples
Example #16
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """See base class."""
    path = os.path.join(data_dir, "multinli",
                        "multinli.train.%s.tsv" % self.language)
    examples = []
    for row_idx, row in enumerate(self._read_tsv(path)):
        if row_idx == 0:
            continue  # header row
        label = tokenization.convert_to_unicode(row[2])
        # The translated MultiNLI data spells one class "contradictory";
        # normalize it to the canonical "contradiction".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid="train-%d" % row_idx,
                         text_a=tokenization.convert_to_unicode(row[0]),
                         text_b=tokenization.convert_to_unicode(row[1]),
                         label=label))
    return examples
Example #17
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def get_dev_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    target_lang = tokenization.convert_to_unicode(self.language)
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        # Keep only rows in the processor's configured language.
        if tokenization.convert_to_unicode(row[0]) != target_lang:
            continue
        examples.append(
            InputExample(guid="dev-%d" % idx,
                         text_a=tokenization.convert_to_unicode(row[6]),
                         text_b=tokenization.convert_to_unicode(row[7]),
                         label=tokenization.convert_to_unicode(row[1])))
    return examples
Example #18
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """See base class."""
    tsv_path = os.path.join(data_dir, "multinli",
                            "multinli.train.%s.tsv" % self.language)
    cvt = tokenization.convert_to_unicode
    examples = []
    for n, row in enumerate(self._read_tsv(tsv_path)):
        if n == 0:
            continue  # skip header
        label = cvt(row[2])
        if label == cvt("contradictory"):
            # Normalize the translated label spelling.
            label = cvt("contradiction")
        examples.append(InputExample(guid="train-%d" % n,
                                     text_a=cvt(row[0]),
                                     text_b=cvt(row[1]),
                                     label=label))
    return examples
Example #19
Source File: extract_features.py    From models with Apache License 2.0 6 votes vote down vote up
def read_examples(input_file):
  """Read a list of `InputExample`s from an input file.

  Each non-EOF line yields one example; "A ||| B" lines are split into
  a sentence pair, other lines become a single sentence.
  """
  sep_pattern = re.compile(r"^(.*) \|\|\| (.*)$")
  examples = []
  counter = 0
  with tf.io.gfile.GFile(input_file, "r") as reader:
    while True:
      raw_line = tokenization.convert_to_unicode(reader.readline())
      if not raw_line:
        break  # end of file
      content = raw_line.strip()
      hit = sep_pattern.match(content)
      sent_a = hit.group(1) if hit else content
      sent_b = hit.group(2) if hit else None
      examples.append(
          InputExample(unique_id=counter, text_a=sent_a, text_b=sent_b))
      counter += 1
  return examples
Example #20
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Create one `InputExample` per data row for the given split.

    The test split alone has a header row. Test sentences come from
    column 1 (with dummy label "0"); train/dev sentences come from
    column 3 with their label in column 1.
    """
    is_test = set_type == "test"
    examples = []
    for idx, row in enumerate(lines):
        if is_test and idx == 0:
            continue  # header only exists in the test file
        if is_test:
            text, label = tokenization.convert_to_unicode(row[1]), "0"
        else:
            text = tokenization.convert_to_unicode(row[3])
            label = tokenization.convert_to_unicode(row[1])
        examples.append(
            InputExample(guid="%s-%s" % (set_type, idx), text_a=text,
                         text_b=None, label=label))
    return examples
Example #21
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Build MNLI-style `InputExample`s (premise/hypothesis pairs).

    Column 0 contributes a pair id to the guid, columns 8/9 carry the
    two sentences, and the final column carries the label; test rows
    get the placeholder label "contradiction".
    """
    examples = []
    for row_no, row in enumerate(lines):
        if row_no == 0:
            continue  # header row
        pair_id = tokenization.convert_to_unicode(row[0])
        premise = tokenization.convert_to_unicode(row[8])
        hypothesis = tokenization.convert_to_unicode(row[9])
        if set_type == "test":
            label = "contradiction"
        else:
            label = tokenization.convert_to_unicode(row[-1])
        examples.append(
            InputExample(guid="%s-%s" % (set_type, pair_id),
                         text_a=premise, text_b=hypothesis, label=label))
    return examples
Example #22
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def get_dev_examples(self, data_dir):
    """See base class."""
    cvt = tokenization.convert_to_unicode
    wanted = cvt(self.language)
    examples = []
    for n, row in enumerate(self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))):
        if n == 0:
            continue  # header
        # XNLI dev mixes all languages; filter to ours.
        if cvt(row[0]) != wanted:
            continue
        examples.append(InputExample(guid="dev-%d" % n,
                                     text_a=cvt(row[6]),
                                     text_b=cvt(row[7]),
                                     label=cvt(row[1])))
    return examples
Example #23
Source File: run_classifier.py    From models with Apache License 2.0 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for index, row in enumerate(lines):
        if index == 0:
            continue  # column-header row
        raw_label = tokenization.convert_to_unicode(row[2])
        # Map the translated spelling onto the canonical label name.
        if raw_label == tokenization.convert_to_unicode("contradictory"):
            raw_label = tokenization.convert_to_unicode("contradiction")
        examples.append(InputExample(
            guid="train-%d" % index,
            text_a=tokenization.convert_to_unicode(row[0]),
            text_b=tokenization.convert_to_unicode(row[1]),
            label=raw_label))
    return examples
Example #24
Source File: extract_features.py    From models with Apache License 2.0 6 votes vote down vote up
def read_examples(input_file):
  """Read a list of `InputExample`s from an input file.

  "A ||| B" lines produce sentence pairs; plain lines produce
  single-sentence examples. Ids count up from zero.
  """
  examples = []
  uid = 0
  with tf.io.gfile.GFile(input_file, "r") as handle:
    current = tokenization.convert_to_unicode(handle.readline())
    while current:
      body = current.strip()
      parts = re.match(r"^(.*) \|\|\| (.*)$", body)
      if parts is None:
        left, right = body, None
      else:
        left, right = parts.group(1), parts.group(2)
      examples.append(InputExample(unique_id=uid, text_a=left, text_b=right))
      uid += 1
      current = tokenization.convert_to_unicode(handle.readline())
  return examples
Example #25
Source File: run_classifier.py    From BERT with Apache License 2.0 6 votes vote down vote up
def get_dev_examples(self, data_dir):
    """See base class."""
    rows = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    to_u = tokenization.convert_to_unicode
    lang = to_u(self.language)
    examples = []
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        if to_u(row[0]) != lang:
            continue  # row belongs to a different language
        examples.append(
            InputExample(guid="dev-%d" % idx,
                         text_a=to_u(row[6]),
                         text_b=to_u(row[7]),
                         label=to_u(row[1])))
    return examples
Example #26
Source File: run_classifier.py    From BERT with Apache License 2.0 6 votes vote down vote up
def get_train_examples(self, data_dir):
    """See base class."""
    to_u = tokenization.convert_to_unicode
    file_path = os.path.join(
        data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
    examples = []
    for i, row in enumerate(self._read_tsv(file_path)):
        if i == 0:
            continue  # skip header line
        lbl = to_u(row[2])
        # "contradictory" is the spelling used in the translated data.
        if lbl == to_u("contradictory"):
            lbl = to_u("contradiction")
        examples.append(InputExample(guid="train-%d" % i,
                                     text_a=to_u(row[0]),
                                     text_b=to_u(row[1]),
                                     label=lbl))
    return examples
Example #27
Source File: extract_features.py    From BERT with Apache License 2.0 6 votes vote down vote up
def read_examples(input_file):
  """Read a list of `InputExample`s from an input file.

  Each line becomes one example; a " ||| " separator splits a line
  into a sentence pair.
  """
  splitter = re.compile(r"^(.*) \|\|\| (.*)$")
  examples = []
  example_id = 0
  with tf.gfile.GFile(input_file, "r") as fh:
    while True:
      decoded = tokenization.convert_to_unicode(fh.readline())
      if not decoded:
        break  # reached EOF
      trimmed = decoded.strip()
      found = splitter.match(trimmed)
      if found:
        a_text, b_text = found.group(1), found.group(2)
      else:
        a_text, b_text = trimmed, None
      examples.append(
          InputExample(unique_id=example_id, text_a=a_text, text_b=b_text))
      example_id += 1
  return examples
Example #28
Source File: run_classifier.py    From wsdm19cup with MIT License 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Convert TSV rows into `InputExample`s for the given split.

    Only the test split carries a header row. Test text comes from
    column 1 and is paired with the dummy label "0"; train/dev text
    comes from column 3 with the label in column 1.
    """
    examples = []
    for row_idx, fields in enumerate(lines):
        if set_type == "test" and row_idx == 0:
            continue  # skip header (test set only)
        text_col = 1 if set_type == "test" else 3
        text = tokenization.convert_to_unicode(fields[text_col])
        if set_type == "test":
            label = "0"  # no gold labels for test data
        else:
            label = tokenization.convert_to_unicode(fields[1])
        examples.append(InputExample(guid="%s-%s" % (set_type, row_idx),
                                     text_a=text, text_b=None, label=label))
    return examples
Example #29
Source File: run_classifier.py    From wsdm19cup with MIT License 6 votes vote down vote up
def _create_examples(self, lines, set_type):
    """Create premise/hypothesis `InputExample`s, dropping the header.

    Column 0 supplies a pair id for the guid, columns 8/9 the two
    sentences, and the last column the label ("contradiction" is used
    as a placeholder on the unlabeled test split).
    """
    unicodify = tokenization.convert_to_unicode
    examples = []
    for n, fields in enumerate(lines):
        if n == 0:
            continue  # header row
        guid = "%s-%s" % (set_type, unicodify(fields[0]))
        if set_type == "test":
            gold = "contradiction"
        else:
            gold = unicodify(fields[-1])
        examples.append(InputExample(guid=guid,
                                     text_a=unicodify(fields[8]),
                                     text_b=unicodify(fields[9]),
                                     label=gold))
    return examples
Example #30
Source File: run_classifier.py    From wsdm19cup with MIT License 6 votes vote down vote up
def get_test_examples(self, data_dir):
    """See base class."""
    cvt = tokenization.convert_to_unicode
    target = cvt(self.language)
    examples = []
    rows = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
    for idx, row in enumerate(rows):
        if idx == 0:
            continue  # header row
        # The test TSV interleaves all XNLI languages; keep only ours.
        if cvt(row[0]) != target:
            continue
        examples.append(InputExample(guid="test-%d" % idx,
                                     text_a=cvt(row[6]),
                                     text_b=cvt(row[7]),
                                     label=cvt(row[1])))
    return examples