{"id":253,"date":"2023-08-03T04:10:43","date_gmt":"2023-08-03T04:10:43","guid":{"rendered":"https:\/\/babyai-hub.vizdata.tech\/?p=253"},"modified":"2024-03-28T03:37:37","modified_gmt":"2024-03-28T03:37:37","slug":"thai-named-entity-corpus-bkd-corpus_nlp-dataset","status":"publish","type":"post","link":"https:\/\/babyai-hub.vizdata.tech\/?p=253","title":{"rendered":"NLP : Thai Named Entity Corpus (BKD Corpus)"},"content":{"rendered":"\t\t<div data-elementor-type=\"wp-post\" data-elementor-id=\"253\" class=\"elementor elementor-253\" data-elementor-post-type=\"post\">\n\t\t\t\t\t\t\t<div class=\"elementor-element elementor-element-a30db30 e-flex e-con-boxed e-con e-parent\" data-id=\"a30db30\" data-element_type=\"container\" data-settings=\"{&quot;content_width&quot;:&quot;boxed&quot;}\" data-core-v316-plus=\"true\">\n\t\t\t\t\t<div class=\"e-con-inner\">\n\t\t\t\t<div class=\"elementor-element elementor-element-cdc7c84 elementor-widget elementor-widget-heading\" data-id=\"cdc7c84\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<style>\/*! 
elementor - v3.16.0 - 23-08-2023 *\/\n.elementor-heading-title{padding:0;margin:0;line-height:1}.elementor-widget-heading .elementor-heading-title[class*=elementor-size-]>a{color:inherit;font-size:inherit;line-height:inherit}.elementor-widget-heading .elementor-heading-title.elementor-size-small{font-size:15px}.elementor-widget-heading .elementor-heading-title.elementor-size-medium{font-size:19px}.elementor-widget-heading .elementor-heading-title.elementor-size-large{font-size:29px}.elementor-widget-heading .elementor-heading-title.elementor-size-xl{font-size:39px}.elementor-widget-heading .elementor-heading-title.elementor-size-xxl{font-size:59px}<\/style><h2 class=\"elementor-heading-title elementor-size-default\"><b>Thai Named Entity Corpus (BKD Corpus)<\/b><\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-8d9ef40 elementor-widget elementor-widget-heading\" data-id=\"8d9ef40\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\"><b>NLP Project<\/b><\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c6474b7 elementor-widget elementor-widget-spacer\" data-id=\"c6474b7\" data-element_type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<style>\/*! 
elementor - v3.16.0 - 23-08-2023 *\/\n.elementor-column .elementor-spacer-inner{height:var(--spacer-size)}.e-con{--container-widget-width:100%}.e-con-inner>.elementor-widget-spacer,.e-con>.elementor-widget-spacer{width:var(--container-widget-width,var(--spacer-size));--align-self:var(--container-widget-align-self,initial);--flex-shrink:0}.e-con-inner>.elementor-widget-spacer>.elementor-widget-container,.e-con>.elementor-widget-spacer>.elementor-widget-container{height:100%;width:100%}.e-con-inner>.elementor-widget-spacer>.elementor-widget-container>.elementor-spacer,.e-con>.elementor-widget-spacer>.elementor-widget-container>.elementor-spacer{height:100%}.e-con-inner>.elementor-widget-spacer>.elementor-widget-container>.elementor-spacer>.elementor-spacer-inner,.e-con>.elementor-widget-spacer>.elementor-widget-container>.elementor-spacer>.elementor-spacer-inner{height:var(--container-widget-height,var(--spacer-size))}.e-con-inner>.elementor-widget-spacer.elementor-widget-empty,.e-con>.elementor-widget-spacer.elementor-widget-empty{position:relative;min-height:22px;min-width:22px}.e-con-inner>.elementor-widget-spacer.elementor-widget-empty .elementor-widget-empty-icon,.e-con>.elementor-widget-spacer.elementor-widget-empty .elementor-widget-empty-icon{position:absolute;top:0;bottom:0;left:0;right:0;margin:auto;padding:0;width:22px;height:22px}<\/style>\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t<div class=\"elementor-element elementor-element-5b6dc4c e-flex e-con-boxed e-con e-parent\" data-id=\"5b6dc4c\" data-element_type=\"container\" data-settings=\"{&quot;content_width&quot;:&quot;boxed&quot;}\" data-core-v316-plus=\"true\">\n\t\t\t\t\t<div class=\"e-con-inner\">\n\t\t\t\t<div class=\"elementor-element elementor-element-1b26c38 elementor-widget-divider--view-line_text elementor-widget__width-initial 
elementor-widget-divider--separator-type-pattern elementor-widget-divider--element-align-center elementor-widget elementor-widget-divider\" data-id=\"1b26c38\" data-element_type=\"widget\" data-widget_type=\"divider.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<style>\/*! elementor - v3.16.0 - 23-08-2023 *\/\n.elementor-widget-divider{--divider-border-style:none;--divider-border-width:1px;--divider-color:#0c0d0e;--divider-icon-size:20px;--divider-element-spacing:10px;--divider-pattern-height:24px;--divider-pattern-size:20px;--divider-pattern-url:none;--divider-pattern-repeat:repeat-x}.elementor-widget-divider .elementor-divider{display:flex}.elementor-widget-divider .elementor-divider__text{font-size:15px;line-height:1;max-width:95%}.elementor-widget-divider .elementor-divider__element{margin:0 var(--divider-element-spacing);flex-shrink:0}.elementor-widget-divider .elementor-icon{font-size:var(--divider-icon-size)}.elementor-widget-divider .elementor-divider-separator{display:flex;margin:0;direction:ltr}.elementor-widget-divider--view-line_icon .elementor-divider-separator,.elementor-widget-divider--view-line_text .elementor-divider-separator{align-items:center}.elementor-widget-divider--view-line_icon .elementor-divider-separator:after,.elementor-widget-divider--view-line_icon .elementor-divider-separator:before,.elementor-widget-divider--view-line_text .elementor-divider-separator:after,.elementor-widget-divider--view-line_text .elementor-divider-separator:before{display:block;content:\"\";border-bottom:0;flex-grow:1;border-top:var(--divider-border-width) var(--divider-border-style) var(--divider-color)}.elementor-widget-divider--element-align-left .elementor-divider .elementor-divider-separator>.elementor-divider__svg:first-of-type{flex-grow:0;flex-shrink:100}.elementor-widget-divider--element-align-left .elementor-divider-separator:before{content:none}.elementor-widget-divider--element-align-left 
.elementor-divider__element{margin-left:0}.elementor-widget-divider--element-align-right .elementor-divider .elementor-divider-separator>.elementor-divider__svg:last-of-type{flex-grow:0;flex-shrink:100}.elementor-widget-divider--element-align-right .elementor-divider-separator:after{content:none}.elementor-widget-divider--element-align-right .elementor-divider__element{margin-right:0}.elementor-widget-divider:not(.elementor-widget-divider--view-line_text):not(.elementor-widget-divider--view-line_icon) .elementor-divider-separator{border-top:var(--divider-border-width) var(--divider-border-style) var(--divider-color)}.elementor-widget-divider--separator-type-pattern{--divider-border-style:none}.elementor-widget-divider--separator-type-pattern.elementor-widget-divider--view-line .elementor-divider-separator,.elementor-widget-divider--separator-type-pattern:not(.elementor-widget-divider--view-line) .elementor-divider-separator:after,.elementor-widget-divider--separator-type-pattern:not(.elementor-widget-divider--view-line) .elementor-divider-separator:before,.elementor-widget-divider--separator-type-pattern:not([class*=elementor-widget-divider--view]) .elementor-divider-separator{width:100%;min-height:var(--divider-pattern-height);-webkit-mask-size:var(--divider-pattern-size) 100%;mask-size:var(--divider-pattern-size) 100%;-webkit-mask-repeat:var(--divider-pattern-repeat);mask-repeat:var(--divider-pattern-repeat);background-color:var(--divider-color);-webkit-mask-image:var(--divider-pattern-url);mask-image:var(--divider-pattern-url)}.elementor-widget-divider--no-spacing{--divider-pattern-size:auto}.elementor-widget-divider--bg-round{--divider-pattern-repeat:round}.rtl .elementor-widget-divider .elementor-divider__text{direction:rtl}.e-con-inner>.elementor-widget-divider,.e-con>.elementor-widget-divider{width:var(--container-widget-width,100%);--flex-grow:var(--container-widget-flex-grow)}<\/style>\t\t<div class=\"elementor-divider\" style=\"--divider-pattern-url: 
url(&quot;data:image\/svg+xml,%3Csvg xmlns=&#039;http:\/\/www.w3.org\/2000\/svg&#039; preserveAspectRatio=&#039;none&#039; overflow=&#039;visible&#039; height=&#039;100%&#039; viewBox=&#039;0 0 20 16&#039; fill=&#039;none&#039; stroke=&#039;black&#039; stroke-width=&#039;1&#039; stroke-linecap=&#039;square&#039; stroke-miterlimit=&#039;10&#039;%3E%3Cg transform=&#039;translate(-12.000000, 0)&#039;%3E%3Cpath d=&#039;M28,0L10,18&#039;\/%3E%3Cpath d=&#039;M18,0L0,18&#039;\/%3E%3Cpath d=&#039;M48,0L30,18&#039;\/%3E%3Cpath d=&#039;M38,0L20,18&#039;\/%3E%3C\/g%3E%3C\/svg%3E&quot;);\">\n\t\t\t<span class=\"elementor-divider-separator\">\n\t\t\t\t\t\t\t<h1 class=\"elementor-divider__text elementor-divider__element\">\n\t\t\t\tDESCRIPTION\t\t\t\t<\/h1>\n\t\t\t\t\t\t<\/span>\n\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-3d2ab81 elementor-widget elementor-widget-spacer\" data-id=\"3d2ab81\" data-element_type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t<div class=\"elementor-element elementor-element-fdeaef0 e-flex e-con-boxed e-con e-parent\" data-id=\"fdeaef0\" data-element_type=\"container\" data-settings=\"{&quot;content_width&quot;:&quot;boxed&quot;}\" data-core-v316-plus=\"true\">\n\t\t\t\t\t<div class=\"e-con-inner\">\n\t\t\t\t<div class=\"elementor-element elementor-element-7b1f5a2 elementor-widget elementor-widget-heading\" data-id=\"7b1f5a2\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\"><img decoding=\"async\" draggable=\"false\" role=\"img\" class=\"emoji\" alt=\"\u25b6\" 
src=\"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/svg\/25b6.svg\"\/> <b>ABSTRACT<\/b><\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-343328e elementor-widget elementor-widget-heading\" data-id=\"343328e\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The languages spoken in Asia share common morphological analysis errors in word segmentation which normally propagate to higher-level processing, i.e., part-of-speech (POS) tagging, syntactic parsing, word extraction, and named entity recognition (NER), as we discuss in this research. We introduce the Thai character cluster (TCC) to reduce the errors propagated from word segmentation and POS tagging by incorporating it into the character representation layer of bidirectional long short-term memory (BiLSTM) for NER. The initial NER model is created from the original THAI-NEST named-entity (NE) tagged corpus by applying the best performing BiLSTM-CNN-CRF model (the combination of BiLSTM, convolutional neural network (CNN), and conditional random field (CRF)) with the word, POS, and TCC embedding. We determine the errors and improve the consistency of the NE annotation through our holdout method by retraining the model with the corrected training set. After the iteration, the overall result of the annotation F1-score has been improved to reach 89.22%, which improves 16.21% from the model trained on the original corpus. The result of our iterative verification is a promising method for low resource language modeling. As a result, The NE silver standard corpus is newly generated for the Thai NER task, called Bangkok Data NE tagged Corpus (BKD). 
The consistency of annotation is checked and revised according to the improvement of the scope of NE detection by TCC which can recover the errors in word segmentation.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-118316c elementor-widget elementor-widget-heading\" data-id=\"118316c\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">\u25b6<b> INDEX TERMS<\/b><\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ebb21c9 elementor-widget elementor-widget-heading\" data-id=\"ebb21c9\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Low-resource languages, named entity recognition, Thai language, Thai named entity<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-7f11207 elementor-widget elementor-widget-heading\" data-id=\"7f11207\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\"><img decoding=\"async\" draggable=\"false\" role=\"img\" class=\"emoji\" alt=\"\u25b6\" src=\"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/svg\/25b6.svg\" style=\"font-family: Sarabun, sans-serif; font-size: 16px; font-style: normal; font-weight: 300; background-color: rgb(255, 255, 255);\"\/><span style=\"font-family: Sarabun, sans-serif; font-size: 16px; font-style: normal; font-weight: 700;\"> <\/span><b>INTRODUCTION<\/b><\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-8aef1cf elementor-widget elementor-widget-heading\" data-id=\"8aef1cf\" data-element_type=\"widget\" 
data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">This paper proposes a novel method of iterative NE tagging refinement that can be applied to a noisy NE corpus, to generate a silver standard Bangkok Data NE tagged Corpus (BKD) to solve the problem of the limited resources of the Thai language. Some difficulties of language processing in the fundamental issues are also a high barrier to overcome, to improve language processing. The Thai language is an alphabetic language, with no explicit word or sentence boundary, and it is an isolated language, without grammatical markers. These issues can induce a vast amount of ambiguities in morphosyntactic analysis. Therefore, the consistency problem in word segmentation and grammatical tag annotation is not trivial. The errors are always propagated to consecutive tasks<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-fddb6bc elementor-widget elementor-widget-heading\" data-id=\"fddb6bc\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The associate editor coordinating the review of this manuscript and approving it for publication was Essam A. Rashed.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-8a6aaaa elementor-widget elementor-widget-heading\" data-id=\"8a6aaaa\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">such as word dependency, parse tree annotation, word sense disambiguation, and certainly in the current task of named-entity (NE) annotation. 
Since word segmentation is not in the scope of this paper, we manually correct the result when necessary and apply the state-of-the-art word segmentation based on the trigram part-of-speech (POS) tagging model [17]. The POS tagset is introduced from the ORCHID POS tagged corpus [20].<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-5f3e657 elementor-widget elementor-widget-heading\" data-id=\"5f3e657\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The THAI-NEST corpus [27] is currently the largest Thai NE corpus, collected from 21 Thai online newspaper publishers from January to December 2009. The collection contains a good balance in the variation of the text domain. In the corpus, word segmentation is applied and annotated with the POS tagset. On top of that, the seven types of NE tags, namely, date (DAT), location (LOC), measure (MEA), name (NAM), organization (ORG), person (PER), and time (TIM) are annotated to the corresponding words. The corpus is manually annotated with one type of NE for a particular file. The size of the corpus is significantly large. It is a corpus of approximately seven million words or about 80 thousand sentences, as shown in Table 1. 
However, it needs a proper data cleansing process, especially for the word segmentation errors and inconsistency in NE tagging, as we preliminarily conducted a consistency test on the corpus and found that the errors have a significant effect on the accuracy of the named entity recognition (NER) task.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-d1658b8 elementor-widget elementor-widget-heading\" data-id=\"d1658b8\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The accuracy of word segmentation has a strong impact on the quality of the corpus. The error normally propagates to higher-level processing in producing the features of word spelling and its POS labeling. To recover the errors, TCC is used instead of a character in many cases. [19] proposed TCC, the smallest standalone character unit according to the spelling rules, to represent the character to reduce the errors in determining the breakable positions in the string. For example, the next breakable position in the string after \u2018\u2018\u0e01\u0e23\u0e30\u0e17\u0e23\u0e27\u0e07\u0e01\u0e32\u0e23\u0e04\u2019\u2019 is \u2018\u2018\u0e01\u0e23\u0e30\u0e17\u0e23\u0e27\u0e07\u0e01\u0e32\u0e23\u0e04\u0e25\u0e31\u0e07\u2019\u2019, not \u2018\u2018\u0e01\u0e23\u0e30\u0e17\u0e23\u0e27\u0e07\u0e01\u0e32\u0e23\u0e04\u0e25\u2019\u2019 because the vowel sign \u2018\u2018 -\u0e31 \u2019\u2019 has to be combined with a base consonant like a diacritical sign. \u2018\u2018\u0e25\u0e31\u0e07\u2019\u2019 is called a character cluster or Thai Character Cluster (TCC). TCC can be defined by a set of spelling rules. 
There is no ambiguity in forming a cluster, therefore, there is no error in clustering and it can provide a better context to represent a character-level feature of a word.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-9b34cfd elementor-widget elementor-widget-heading\" data-id=\"9b34cfd\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">To improve the performance of NER, especially for the non-segmented language such as Thai, we found that utilizing the advantages of TCC representation instead of a character in the character embedding layer of bidirectional long short-term memory (BiLSTM) can mitigate the NER errors according to the inaccurate word segmentation results. The approach is also viable for other non-segmented languages having similar character composition clues such as Lao, Myanmar, and Cambodian. Though it is out of the scope of this research, the larger unit of character composition can reduce the perplexity at the character representation level.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-2a4c115 elementor-widget elementor-widget-heading\" data-id=\"2a4c115\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">In this paper, we propose an efficient method to clean up a noisy corpus with language difficulties by state-of-the-art NER using the combination of BiLSTM, convolutional neural network (CNN), and conditional random field (CRF) (BiLSTM-CNN-CRF) [13]. 
Our novel approach of applying the Thai character cluster (TCC) proposed by [18] for character-level representation in the character-embedding layer performs better in BiLSTM-CNN-CRF with POS and word embedding [22], [23].<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-f8fdfb8 elementor-widget elementor-widget-heading\" data-id=\"f8fdfb8\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The main contribution of this research is to overcome the problem of the shortage and the quality of annotated corpus for model training and evaluation. The existing corpora, though there are not many, still have a big problem in the consistency of word segmentation and annotation. Using a noisy corpus for training certainly cannot expect a high precision out of the model. With the limitations of the availability of the NE annotated corpus, we propose an efficient method to refine the existing corpus though it is full of errors because developing a new large corpus is labor-intensive and costly.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-c5bf855 elementor-widget elementor-widget-heading\" data-id=\"c5bf855\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">We refine the noisy corpus of THAI-NEST automatically to construct a so-called silver standard corpus (automatically constructed corpus with comparable quality to the gold standard corpus) [7] for the NER study.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-5cc8ea7 elementor-widget elementor-widget-heading\" data-id=\"5cc8ea7\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div 
class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">The paper is structured as follows. Section II provides the grounded works on NE dataset development and the proposed tagset for preparing the NE corpus. Section III summarizes the previous works on some effective approaches for the Thai language NER, and our proposed solution. Section IV gives the information of the THAI-NEST corpus which is used in this study. The size and the annotation scheme are elaborated. Section V discusses how CNN-TCC for character-level representation in the character embedding level can capture the NE spelling pattern to enhance the performance of BiLSTM-CNN-CRF for Thai NER. Section VI describes the performance and the comparison results of each model when applied to the same corpora. Section VII proposes a method of iterative NE tagging refinement to improve the existing noisy corpus and analysis results of the detected annotation errors. Section VIII concludes the achievement of our proposed approach of using TCC in character-level representation and applying the iterative refinement method to achieve the BKD silver standard Thai NE corpus.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-aa31072 elementor-widget elementor-widget-heading\" data-id=\"aa31072\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">RELATED WORKS<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-f81c937 elementor-widget elementor-widget-heading\" data-id=\"f81c937\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Many types of NE tagsets have been proposed. 
The types and number of tags are defined according to the groups and the tasks they are used in. The following are some examples of the representative tagsets used in the NER task: question-answering, information extraction, text summarization, and machine translation.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-b21e7ac elementor-widget elementor-widget-heading\" data-id=\"b21e7ac\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">\u2022 CONLL-2003, reported in NER shared task dataset [29], is a well-known collection of Reuters 1,393 newswire articles that contains a large portion of sports news. It is annotated with four entity types (person (PER), location (LOC), organization (ORG), and miscellaneous (MISC)).<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-840f519 elementor-widget elementor-widget-heading\" data-id=\"840f519\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">\u2022 MUC-6 [5] is a dataset consisting of newswire articles from the Wall Street Journal annotated with person (PER), location (LOC), organization (ORG), as well as several temporal and numerical entities.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-bad1c12 elementor-widget elementor-widget-heading\" data-id=\"bad1c12\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">\u2022 OntoNotes 5.0 dataset [8] is annotated with 18 fine-grained NE categories. 
Those categories are PERSON, NORP (Nationalities or religious or political groups), FACILITY, ORGANIZATION, GPE (Countries, cities, states), LOCATION, PRODUCT, EVENT, WORK OF ART, LAW, LANGUAGE, DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL, and CARDINAL.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-8ae99d3 elementor-widget elementor-widget-heading\" data-id=\"8ae99d3\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">\u2022 IJCNLP-08 NERSSEAL shared task tagset is a dataset consisting of 12 fine-grained NE tags.1 Those tags are person (NEP), designation (NED), organization (NEO), abbreviation (NEA), brand (NEB), title-person (NETP), title-object (NETO), location (NEL), time (NETI), number (NEN), measure (NEM), and terms (NETE).<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-b0d2792 elementor-widget elementor-widget-heading\" data-id=\"b0d2792\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">1 available at http:\/\/ltrc.iiit.ac.in\/ner-ssea-08<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-b93080a elementor-widget elementor-widget-heading\" data-id=\"b93080a\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">For the Thai language, there is a THAI-NEST corpus which is word-segmented and annotated with POS and seven types of NE tags, namely, date (DAT), location (LOC), measure (MEA), name (NAM), organization (ORG), person (PER), and time (TIM). 
The tagset is detailed enough for common tasks but due to Thai language difficulties in word segmentation and POS tagging, it is difficult to find common agreement in the annotation. These morphological errors cause difficulties in higher levels of NE annotation.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-9cef3df elementor-widget elementor-widget-heading\" data-id=\"9cef3df\" data-element_type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">[12] has exhaustively surveyed NER research and classified the approaches into (i) Rule-based approach, which does not need annotated data as it relies on hand-crafted rules; (ii) Unsupervised learning approach, which relies on unsupervised algorithms without hand-tagged training examples; (iii) Feature-based supervised learning approach, which relies on supervised learning algorithms with careful feature engineering; (iv) Deep-learning-based approach, which automatically discovers representations needed for the classification and\/or detection from raw input in an end-to-end manner. The state-of-the-art NER in the deep-learning-based approach has been proposed by [13], BiLSTM-CNN-CRF. 
An experiment has been conducted on the CoNLL-2003 corpus, obtaining 91.21% F1 for the NER task.<\/h2>\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-d207ffc elementor-widget elementor-widget-read-more\" data-id=\"d207ffc\" data-element_type=\"widget\" data-widget_type=\"read-more.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t<\/div><\/div><\/div><\/div><\/div> <a href=\"https:\/\/babyai-hub.vizdata.tech\/?p=253#more-253\" class=\"more-link elementor-more-link\"><span aria-label=\"Continue reading NLP : Thai Named Entity Corpus (BKD Corpus)\">Reading more<\/span><\/a>","protected":false},"excerpt":{"rendered":"<p>Thai Named Entity Corpus (BKD Corpus) NLP Project DESCRIPTION ABSTRACT The languages spoken in Asia share common morphological analysis errors in word.<\/p>\n","protected":false},"author":3,"featured_media":874,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[3,30],"tags":[125,131,64,9,134,133,44,62,132,130],"class_list":["post-253","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-dataset","category-nlp","tag-bkd","tag-bkd-corpus","tag-corpus","tag-datasets","tag-entity","tag-named","tag-nlp","tag-thai","tag-thai-named","tag-thai-named-entity-corpus"],"_links":{"self":[{"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/posts\/253","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=253"}],"version-history":[{"count":166,"href":"https:\/\/b
abyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/posts\/253\/revisions"}],"predecessor-version":[{"id":2269,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/posts\/253\/revisions\/2269"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=\/wp\/v2\/media\/874"}],"wp:attachment":[{"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=253"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=253"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/babyai-hub.vizdata.tech\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=253"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}