# Model description

- Morphosyntactic analyzer: Stanza
- Tagset: UD
- Embedding vectors: Fasttext (wiki)
- Dataset: PDB (http://git.nlp.ipipan.waw.pl/alina/PDBUD/tree/master/PDB-UD/PDB-UD)

# How to use

## Clone

```bash
git clone git@hf.co:ipipan/nlpre_stanza_ud_fasttext_pdb
```

## Load model

```python
import os

import stanza

lang = 'pl'
model_name = 'nlpre_stanza_ud_fasttext_pdb'
prefix = 'pdb1809'

config = {
    # Comma-separated list of processors to use
    'processors': 'tokenize,mwt,pos,lemma',
    # Language code for the language to build the Pipeline in
    'lang': lang,
    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
    # You only need model paths if you have a specific model outside of stanza_resources
    'tokenize_model_path': os.path.join(model_name, f'{lang}_{prefix}_tokenizer.pt'),
    'mwt_model_path': os.path.join(model_name, f'{lang}_{prefix}_mwt_expander.pt'),
    'pos_model_path': os.path.join(model_name, f'{lang}_{prefix}_tagger.pt'),
    'pos_pretrain_path': os.path.join(model_name, f'{lang}_{prefix}.pretrain.pt'),
    'lemma_model_path': os.path.join(model_name, f'{lang}_{prefix}_lemmatizer.pt'),
    # Use pretokenized text as input and disable tokenization
    'tokenize_pretokenized': True
}

model = stanza.Pipeline(**config)
```