I might be working on a tutorial on this project, so I figured I'd double-check explicitly: are multi-token phrases supported? My impression is that they're not, and that's totally fine, but I just wanted to make sure. Below is the minimal example I ran, followed by the warnings and traceback it produced.
import spacy
from spacy import displacy
import concise_concepts
# Minimal reproduction: concept seeds per label. The "utensil" entries are
# deliberately multi-token phrases; with this setup they are reported as
# "not present in vector model" and add_pipe() raises an AssertionError
# (see the output below the script).
data = {
    "fruit": ["apple", "pear", "orange"],
    "vegetable": ["broccoli", "spinach", "tomato"],
    "meat": ["beef", "pork", "fish", "lamb"],
    "utensil": ["large oven", "warm stove", "big knife"],
}

text = """
Heat the oil in a large pan and add the Onion, celery and carrots.
Then, cook over a medium–low heat for 10 minutes, or until softened.
Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.
Later, add some oranges and chickens. """

# The stock NER component is disabled so that doc.ents only contains the
# entities set by the concise_concepts component.
nlp = spacy.load("en_core_web_lg", disable=["ner"])
# ent_score for entity confidence scoring
nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True})
doc = nlp(text)

# displaCy options: one colour per base label, plus the list of labels to show.
options = {
    "colors": {
        "fruit": "darkorange",
        "vegetable": "limegreen",
        "meat": "salmon",
        "utensil": "gray",
    },
    "ents": ["fruit", "vegetable", "meat", "utensil"],
}

# Relabel every entity as "<label> (<score>%)" and register the new label with
# displaCy, reusing the colour configured for the original label.
ents = doc.ents
for ent in ents:
    new_label = f"{ent.label_} ({float(ent._.ent_score):.0%})"
    options["colors"][new_label] = options["colors"].get(ent.label_.lower(), None)
    options["ents"].append(new_label)
    ent.label_ = new_label
# Write the relabelled spans back to the doc before rendering.
doc.ents = ents

displacy.render(doc, style="ent", options=options)
word ´large oven´ from key ´utensil´ not present in vector model
word ´warm stove´ from key ´utensil´ not present in vector model
word ´big knife´ from key ´utensil´ not present in vector model
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Input In [4], in <cell line: 21>()
18 nlp = spacy.load("en_core_web_lg", disable=["ner"])
20 # ent_score for entity condifence scoring
---> 21 nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True})
22 doc = nlp(text)
24 options = {"colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon", "utensil": "gray"},
25 "ents": ["fruit", "vegetable", "meat", "utensil"]}
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/spacy/language.py:795, in Language.add_pipe(self, factory_name, name, before, after, first, last, source, config, raw_config, validate)
787 if not self.has_factory(factory_name):
788 err = Errors.E002.format(
789 name=factory_name,
790 opts=", ".join(self.factory_names),
(...)
793 lang_code=self.lang,
794 )
--> 795 pipe_component = self.create_pipe(
796 factory_name,
797 name=name,
798 config=config,
799 raw_config=raw_config,
800 validate=validate,
801 )
802 pipe_index = self._get_pipe_index(before, after, first, last)
803 self._pipe_meta[name] = self.get_factory_meta(factory_name)
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/spacy/language.py:674, in Language.create_pipe(self, factory_name, name, config, raw_config, validate)
671 cfg = {factory_name: config}
672 # We're calling the internal _fill here to avoid constructing the
673 # registered functions twice
--> 674 resolved = registry.resolve(cfg, validate=validate)
675 filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
676 filled = Config(filled)
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/thinc/config.py:747, in registry.resolve(cls, config, schema, overrides, validate)
738 @classmethod
739 def resolve(
740 cls,
(...)
745 validate: bool = True,
746 ) -> Dict[str, Any]:
--> 747 resolved, _ = cls._make(
748 config, schema=schema, overrides=overrides, validate=validate, resolve=True
749 )
750 return resolved
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/thinc/config.py:796, in registry._make(cls, config, schema, overrides, resolve, validate)
794 if not is_interpolated:
795 config = Config(orig_config).interpolate()
--> 796 filled, _, resolved = cls._fill(
797 config, schema, validate=validate, overrides=overrides, resolve=resolve
798 )
799 filled = Config(filled, section_order=section_order)
800 # Check that overrides didn't include invalid properties not in config
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/thinc/config.py:868, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
865 getter = cls.get(reg_name, func_name)
866 # We don't want to try/except this and raise our own error
867 # here, because we want the traceback if the function fails.
--> 868 getter_result = getter(*args, **kwargs)
869 else:
870 # We're not resolving and calling the function, so replace
871 # the getter_result with a Promise class
872 getter_result = Promise(
873 registry=reg_name, name=func_name, args=args, kwargs=kwargs
874 )
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/concise_concepts/__init__.py:47, in make_concise_concepts(nlp, name, data, topn, model_path, word_delimiter, ent_score, exclude_pos, exclude_dep, include_compound_words, case_sensitive)
9 @Language.factory(
10 "concise_concepts",
11 default_config={
(...)
45 case_sensitive: bool,
46 ):
---> 47 return Conceptualizer(
48 nlp=nlp,
49 name=name,
50 data=data,
51 topn=topn,
52 model_path=model_path,
53 word_delimiter=word_delimiter,
54 ent_score=ent_score,
55 exclude_pos=exclude_pos,
56 exclude_dep=exclude_dep,
57 include_compound_words=include_compound_words,
58 case_sensitive=case_sensitive,
59 )
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/concise_concepts/conceptualizer/Conceptualizer.py:95, in Conceptualizer.__init__(self, nlp, name, data, topn, model_path, word_delimiter, ent_score, exclude_pos, exclude_dep, include_compound_words, case_sensitive)
93 else:
94 self.match_key = "LEMMA"
---> 95 self.run()
96 self.data_upper = {k.upper(): v for k, v in data.items()}
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/concise_concepts/conceptualizer/Conceptualizer.py:101, in Conceptualizer.run(self)
99 self.determine_topn()
100 self.set_gensim_model()
--> 101 self.verify_data()
102 self.expand_concepts()
103 self.verify_data(verbose=False)
File ~/Development/prodigy-demos/venv/lib/python3.8/site-packages/concise_concepts/conceptualizer/Conceptualizer.py:193, in Conceptualizer.verify_data(self, verbose)
188 logger.warning(
189 f"word ´{word}´ from key ´{key}´ not present in vector"
190 " model"
191 )
192 verified_data[key] = verified_values
--> 193 assert len(
194 verified_values
195 ), f"None of the entries for key {key} are present in the vector model"
196 self.data = deepcopy(verified_data)
197 self.original_data = deepcopy(self.data)
AssertionError: None of the entries for key utensil are present in the vector model