KyrgyzNER model (xlm-roberta-base)

kyrgyzNER model

The original repository: https://github.com/Akyl-AI/KyrgyzNER
The paper will be uploaded soon.
The KyrgyzNER dataset and code will be uploaded soon.

This model is a fine-tuned version of xlm-roberta-base on the KyrgyzNER dataset. It achieves the following results on the evaluation set:

Loss: 0.3273
Precision: 0.7090
Recall: 0.6946
F1: 0.7017
Accuracy: 0.9119

How to use

You can use this model with the Transformers pipeline for NER.

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import pipeline

# Readable tag for each generic "LABEL_<n>" name; the position in the tuple
# below is the numeric suffix <n>.
id2label = {
    f"LABEL_{idx}": tag
    for idx, tag in enumerate((
        'B-NATIONAL',       # 0
        'I-PLANT',
        'I-ORGANISATION',
        'B-ORGANISATION',
        'B-MEDIA',
        'I-ARTIFACT',       # 5
        'B-AWARD',
        'B-UNKNOWN',
        'I-LOCATION',
        'B-PERSON',
        'I-LEGAL',          # 10
        'B-BUSINESS',
        'B-ACRONYM',
        'I-PERIOD',
        'B-INSTITUTION',
        'I-MEASURE',        # 15
        'B-CREATION',
        'I-ACRONYM',
        'I-AWARD',
        'I-WEBSITE',
        'B-PERIOD',         # 20
        'I-PERSON',
        'I-PERSON_TYPE',
        'B-SUBSTANCE',
        'O',
        'B-PLANT',          # 25
        'I-INSTITUTION',
        'I-SUBSTANCE',
        'I-INSTALLATION',
        'B-CONCEPT',
        'B-TITLE',          # 30
        'I-EVENT',
        'B-ARTIFACT',
        'B-MEASURE',
        'B-LOCATION',
        'I-BUSINESS',       # 35
        'B-ANIMAL',
        'B-PERSON_TYPE',
        'B-INSTALLATION',
        'I-TITLE',
        'B-IDENTIFIER',     # 40
        'I-IDENTIFIER',
        'B-LEGAL',
        'I-MEDIA',
        'I-CONCEPT',
        'I-UNKNOWN',        # 45
        'B-EVENT',
        'B-WEBSITE',
        'I-NATIONAL',
        'I-CREATION',
        'I-ANIMAL',         # 50
    ))
}

# Identifier of the fine-tuned checkpoint to load.
model_ckpt = "TTimur/xlm-roberta-base-kyrgyzNER"

# Load the configuration, tokenizer and token-classification model from the checkpoint.
config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, config=config)

# aggregation_strategy="none": predictions are reported per token piece, without grouping.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none",
)

example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет."
ner_results = nlp(example)
for result in ner_results:
    # Replace the generic LABEL_<n> name with its readable tag, in place,
    # so that ner_results itself carries the readable tags afterwards.
    result['entity'] = id2label[result['entity']]
    print(result)
    
# output:
# {'entity': 'B-LOCATION', 'score': 0.95103735, 'index': 1, 'word': '▁Кыргызстан', 'start': 0, 'end': 10}
# {'entity': 'B-LOCATION', 'score': 0.79447913, 'index': 2, 'word': '▁Ор', 'start': 11, 'end': 13}
# {'entity': 'I-LOCATION', 'score': 0.8703734, 'index': 3, 'word': 'то', 'start': 13, 'end': 15}
# {'entity': 'I-LOCATION', 'score': 0.942387, 'index': 4, 'word': '▁Азия', 'start': 16, 'end': 20}
# {'entity': 'I-LOCATION', 'score': 0.8542615, 'index': 5, 'word': 'нын', 'start': 20, 'end': 23}
# {'entity': 'I-LOCATION', 'score': 0.70930535, 'index': 6, 'word': '▁түн', 'start': 24, 'end': 27}
# {'entity': 'I-LOCATION', 'score': 0.6540094, 'index': 7, 'word': 'дүк', 'start': 27, 'end': 30}
# {'entity': 'I-LOCATION', 'score': 0.63446337, 'index': 8, 'word': '-', 'start': 30, 'end': 31}
# {'entity': 'I-LOCATION', 'score': 0.6204858, 'index': 9, 'word': 'чы', 'start': 31, 'end': 33}
# {'entity': 'I-LOCATION', 'score': 0.6786872, 'index': 10, 'word': 'г', 'start': 33, 'end': 34}
# {'entity': 'I-LOCATION', 'score': 0.64190257, 'index': 11, 'word': 'ыш', 'start': 34, 'end': 36}
# {'entity': 'O', 'score': 0.64438057, 'index': 12, 'word': 'ында', 'start': 36, 'end': 40}
# {'entity': 'O', 'score': 0.9916931, 'index': 13, 'word': '▁орун', 'start': 41, 'end': 45}
# {'entity': 'O', 'score': 0.9953047, 'index': 14, 'word': '▁алган', 'start': 46, 'end': 51}
# {'entity': 'O', 'score': 0.9901377, 'index': 15, 'word': '▁мамлекет', 'start': 52, 'end': 60}
# {'entity': 'O', 'score': 0.99605453, 'index': 16, 'word': '.', 'start': 60, 'end': 61}


# Glue the per-piece predictions back into whole words.
# A piece whose text starts with "▁" opens a new word; any other piece is a
# continuation of the word currently being built. Each word keeps the label
# of the piece that opened it.
token = ""        # word currently being assembled (still carries its "▁" marker)
label_list = []   # one label per opened word
token_list = []   # finished words with the "▁" marker removed

for piece in ner_results:
    text = piece["word"]
    if not text.startswith("▁"):
        # Continuation piece: extend the current word.
        token += text
        continue
    # Word-start piece: store the previous word (if there is one) and start over.
    if token:
        token_list.append(token.replace("▁", ""))
    token = text
    label_list.append(piece["entity"])

# The last word is still pending once the loop ends.
token_list.append(token.replace("▁", ""))

for token, label in zip(token_list, label_list):
    print(f"{token}\t{label}")


# output:
# Кыргызстан    B-LOCATION
# Орто  B-LOCATION
# Азиянын   I-LOCATION
# түндүк-чыгышында  I-LOCATION
# орун  O
# алган O
# мамлекет. O

# aggregation_strategy="simple": the pipeline returns grouped spans
# (key 'entity_group') instead of one entry per token piece.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет."

ner_results = nlp(example)
for result in ner_results:
    # Groups are reported under the generic LABEL_<n> names; map each one to
    # its readable tag before printing.
    result['entity_group'] = id2label[result['entity_group']]
    print(result)

# output:
# {'entity_group': 'B-LOCATION', 'score': 0.87275827, 'word': 'Кыргызстан Ор', 'start': 0, 'end': 13}
# {'entity_group': 'I-LOCATION', 'score': 0.73398614, 'word': 'то Азиянын түндүк-чыгыш', 'start': 13, 'end': 36}
# {'entity_group': 'O', 'score': 0.92351407, 'word': 'ында орун алган мамлекет.', 'start': 36, 'end': 61}

NE classes

PERSON, LOCATION, MEASURE, INSTITUTION, PERIOD, ORGANISATION, MEDIA, TITLE, BUSINESS, LEGAL, EVENT, ARTIFACT, INSTALLATION, PERSON_TYPE, NATIONAL, CONCEPT, CREATION, WEBSITE, SUBSTANCE, ACRONYM, IDENTIFIER, UNKNOWN, AWARD, ANIMAL, PLANT

You can download the model from Hugging Face.