Python spacy.gold() Examples

The following are 4 code examples of spacy.gold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module spacy , or try the search function

Example #1

Source File: crf_entity_extractor.py From Rasa_NLU_Chi with Apache License 2.0

7 votes

def _from_json_to_crf(self,
                          message,  # type: Message
                          entity_offsets  # type: List[Tuple[int, int, Text]]
                          ):
        # type: (...) -> List[Tuple[Text, Text, Text, Text]]
        """Convert json examples to format of underlying crfsuite."""
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
        if '-' in ents:
            logger.warn("Misaligned entity annotation in sentence '{}'. "
                        "Make sure the start and end values of the "
                        "annotated training examples end at token "
                        "boundaries (e.g. don't include trailing "
                        "whitespaces).".format(doc.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)

Example #2

Source File: CRF.py From NLU with MIT License

6 votes

def jsonToCrf(self, json_eg, spacy_nlp):
        
        entity_offsets = []

        for sentence in json_eg['text']:

            doc = spacy_nlp(sentence)

            for i in json_eg['entities']:
                
                entity_offsets.append(tuple((i['rangeFrom'],i['rangeTo'],i['entity'])))
                
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
            crf_format = [(doc[i].text, doc[i].tag_, ents[i]) for i in range(len(doc))]
        
        return crf_format

Example #3

Source File: crf_entity_extractor.py From rasa_nlu with Apache License 2.0

5 votes

def _from_json_to_crf(self,
                          message: Message,
                          entity_offsets: List[Tuple[int, int, Text]]
                          ) -> List[Tuple[Text, Text, Text, Text]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse

            doc = message.get("spacy_doc")
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

        if '-' in ents:
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(message.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)

Example #4

Source File: CRF.py From NLU with MIT License

5 votes

def createDataset(self, intents, spacy_nlp):
        
        dataset = []
        entity_offsets = []
        
        intentCounter = 0
        for intent in intents:
            
            sentenceCounter = 0
            for sentence in intent['text']:
                
                doc = spacy_nlp(sentence)
                print(doc.text)

                for entity in intent['entities'][sentenceCounter]:
                    
                    entity_offsets.append(tuple((entity['rangeFrom'],entity['rangeTo'],entity['entity'])))
                
                gold = GoldParse(doc, entities=entity_offsets)
                ents = [l[5] for l in gold.orig_annot]
                crf_format = [(doc[entity].text, doc[entity].tag_, ents[entity]) for i in range(len(doc))]
                dataset.append(crf_format)
                sentenceCounter = sentenceCounter + 1
            
            intentCounter = intentCounter + 1

        return dataset