{"id":62267,"date":"2022-05-31T15:15:48","date_gmt":"2022-05-31T06:15:48","guid":{"rendered":"https:\/\/smilegate.ai\/?p=62267"},"modified":"2022-05-31T15:17:44","modified_gmt":"2022-05-31T06:17:44","slug":"korean-tokenizer","status":"publish","type":"post","link":"https:\/\/smilegate.ai\/cn\/2022\/05\/31\/korean-tokenizer\/","title":{"rendered":"\ud55c\uad6d\uc5b4 tokenizer"},"content":{"rendered":"
[\uc0dd\uc131\uc9c0\ub2a5\uac1c\ubc1c\ud300 \uae40\uc131\ud604] \uac00\uc7a5 \ub2e8\uc21c\ud55c \ud1a0\ud06c\ub098\uc774\uc9d5 \ub2e8\uc704\ub97c \uc0dd\uac01\ud574\ubcf8\ub2e4\uba74, \uc5b4\uc808(\ub744\uc5b4\uc4f0\uae30) \ub2e8\uc704\ub85c \uc790\ub974\ub294 \ubc29\ubc95\uc774 \uc788\uc744 \uac83\uc785\ub2c8\ub2e4. \uc624\ub298 \uc18c\uac1c\ub4dc\ub9b4 \ub17c\ubb38\uc740, \uc774 \ud1a0\ud06c\ub098\uc774\uc9d5\uacfc \uad00\ub828\ub41c \uc5f0\uad6c\uc785\ub2c8\ub2e4. (\uc5f0\uad6c git \ub9c1\ud06c<\/a>, \ub17c\ubb38 \ub9c1\ud06c<\/a>)<\/p>\n\n\n\n \ubcf8 \uc5f0\uad6c\ub294 \ud55c\uad6d\uc5b4 \uc5b8\uc5b4\ubaa8\ub378\uc5d0\uc11c\uc758 tokenizing \ubc29\uc2dd\uc5d0 \ub530\ub978 \uc131\ub2a5 \ud3c9\uac00\ub97c \uc9c4\ud589\ud558\uc600\uace0, \uadf8 \ub2e8\uc704\ub294 \uacb0\uacfc\ub294 \uc5b4\ub560\uc744\uae4c\uc694?<\/p>\n\n\n\n \uacb0\uacfc\uc801\uc73c\ub85c\ub294 morpheme-aware subword \ubc29\uc2dd\uc774 \ud55c\uad6d\uc5b4 \uc790\uc5f0\uc5b4\ucc98\ub9ac\uc5d0\uc11c \uac00\uc7a5 \uc88b\uc740 \uc131\ub2a5\uc744 \ub098\ud0c0\ub0b4\uc8fc\uc5c8\uc2b5\ub2c8\ub2e4 \ud83d\ude42 \uc774\ub7f0 \uc120\ud589 \uc5f0\uad6c\ub4e4\uc744 \ubc14\ud0d5\uc73c\ub85c, \uc5b8\uc5b4\ubaa8\ub378 \ud639\uc740 \uc790\uc5f0\uc5b4\ucc98\ub9ac\ub97c \uc9c4\ud589\ud560 \ub54c, \uac00\uc7a5 \uc801\uc808\ud55c \ud55c\uad6d\uc5b4 \ud1a0\ud06c\ub098\uc774\uc800\ub97c \uc120\ud0dd\ud560 \uc218 \uc788\uc744 \uac83\uc785\ub2c8\ub2e4. 
<\/p>\n [\uc0dd\uc131\uc9c0\ub2a5\uac1c\ubc1c\ud300 \uae40\uc131\ud604] \uc6b0\ub9ac\ub294 \ud55c\uad6d\uc5b4 \ubb38\uc11c\ub97c \ubcfc \ub54c, \uadf8 \ub300\uc0c1\uc744 \uc758\ubbf8\ub97c \uac00\uc9c0\ub294 \ub2e8\uc704\ub85c \ucabc\uac1c\uc11c \uc774\ud574\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc608\ub97c \ub4e4\uc5b4, “\uc2a4\ub9c8\uc77c\uac8c\uc774\ud2b8” \uc758 \uacbd\uc6b0, “\uc2a4\ub9c8\uc77c” \uacfc, \ubb38\uc744 \uc758\ubbf8\ud558\ub294 “\uac8c\uc774\ud2b8”\ub85c \uc774\ud574\ud560 \uc218 \uc788\uaca0\uc8e0. \uc774\ub807\uac8c \uc790\uc5f0\uc5b4\ub97c \ucabc\uac1c\ub294 \ud589\uc704\ub97c \ud1a0\ud06c\ub098\uc774\uc9d5 (tokenizing) \uc774\ub77c\uace0 \ud569\ub2c8\ub2e4. \uac00\uc7a5 \ub2e8\uc21c\ud55c \ud1a0\ud06c\ub098\uc774\uc9d5 \ub2e8\uc704\ub97c \uc0dd\uac01\ud574\ubcf8\ub2e4\uba74, \uc5b4\uc808(\ub744\uc5b4\uc4f0\uae30) \ub2e8\uc704\ub85c \uc790\ub974\ub294 \ubc29\ubc95\uc774 \uc788\uc744 \uac83\uc785\ub2c8\ub2e4. \uc774 \uacbd\uc6b0, \uc601\uc5b4\uc640 \uac19\uc740 \uc5b8\uc5b4\uc5d0\uc11c\ub294 \uc758\ubbf8\ub97c \ucabc\uac1c\ub294 \ub370 \uaf64\ub098 \ud6a8\uacfc\uc801\uc73c\ub85c \ub3d9\uc791\ud560 \uc218…<\/p>\n
\uc6b0\ub9ac\ub294 \ud55c\uad6d\uc5b4 \ubb38\uc11c\ub97c \ubcfc \ub54c, \uadf8 \ub300\uc0c1\uc744 \uc758\ubbf8\ub97c \uac00\uc9c0\ub294 \ub2e8\uc704\ub85c \ucabc\uac1c\uc11c \uc774\ud574\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.
\uc608\ub97c \ub4e4\uc5b4, “\uc2a4\ub9c8\uc77c\uac8c\uc774\ud2b8” \uc758 \uacbd\uc6b0, “\uc2a4\ub9c8\uc77c” \uacfc, \ubb38\uc744 \uc758\ubbf8\ud558\ub294 “\uac8c\uc774\ud2b8”\ub85c \uc774\ud574\ud560 \uc218 \uc788\uaca0\uc8e0.
\uc774\ub807\uac8c \uc790\uc5f0\uc5b4\ub97c \ucabc\uac1c\ub294 \ud589\uc704\ub97c \ud1a0\ud06c\ub098\uc774\uc9d5 (tokenizing) \uc774\ub77c\uace0 \ud569\ub2c8\ub2e4.<\/p>\n\n\n\n
\ud1a0\ud06c\ub098\uc774\uc9d5\uc758 \ub2e4\uc591\ud55c \ub2e8\uc704\ub4e4<\/figcaption><\/figure>\n\n\n\n
\uc774 \uacbd\uc6b0, \uc601\uc5b4\uc640 \uac19\uc740 \uc5b8\uc5b4\uc5d0\uc11c\ub294 \uc758\ubbf8\ub97c \ucabc\uac1c\ub294 \ub370 \uaf64\ub098 \ud6a8\uacfc\uc801\uc73c\ub85c \ub3d9\uc791\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.
\ud558\uc9c0\ub9cc \ud55c\uad6d\uc5b4\ub294 \uc870\uc0ac\ub098 \uc5b4\ubbf8\ub97c \ubd99\uc5ec\uc11c \ub9d0\uc744 \ub9cc\ub4dc\ub294 \uad50\ucc29\uc5b4\ub85c, \ub744\uc5b4\uc4f0\uae30\ub9cc\uc73c\ub85c\ub294 \uc758\ubbf8\ub97c \uac00\uc9c0\ub294 \ucd5c\uc18c \ub2e8\uc704\ub85c \ud45c\ud604\ud558\uae30 \uc5b4\ub824\uc6cc\uc9c0\uc8e0.
\uc608\ub97c \ub4e4\uc5b4, \uc601\uc5b4\uc5d0\uc11c\uc758 he\/him \uac19\uc740 \ub2e8\uc5b4\uac00 \uc5b4\uc808 \ub2e8\uc704\ub85c \ucabc\uac30\uc744 \ub54c, \uadf8 \uc758\ubbf8\ub97c \uac00\uc9c0\ub294 \ubc18\uba74, \ud55c\uad6d\uc5b4\uc5d0\uc11c\ub294 ‘\uadf8’, ‘\uadf8\uac00’, ‘\uadf8\ub294’, ‘\uadf8\ub97c’, ‘\uadf8\uc758’, ‘\uadf8\uc5d0\uac8c’ \ub4f1\uacfc \uac19\uc740 \ud615\ud0dc\ub85c, \ub354 \uc138\uc138\ud558\uac8c \ucabc\uac1c\uc9c8 \uc218 \uc788\uc2b5\ub2c8\ub2e4.
\ub530\ub77c\uc11c \uc77c\ubc18\uc801\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub294 \uc758\ubbf8\ub97c \uac00\uc9c0\ub294 \ucd5c\uc18c \ub2e8\uc704\uc778 \ud615\ud0dc\uc18c\uc758 \ud615\ud0dc\ub85c \ud1a0\ud06c\ub098\uc774\uc9d5\uc744 \uc218\ud589\ud569\ub2c8\ub2e4.
\uc608\ub97c \ub4e4\uc5b4, ‘\uc548\ub155\ud558\uc138\uc694’ \uc758 \uacbd\uc6b0, [‘\uc548\ub155’, ‘\ud558’, ‘\uc138’, ‘\uc694’] \uc758 \ud615\ud0dc\ub85c \ucabc\uac1c\uc9c0\ub294 \uac83\uc774\uc8e0.
\ubc18\uba74\uc5d0, \ud615\ud0dc\uc18c\ubcf4\ub2e4 \ub354 \uc791\uc740 \ub2e8\uc704\uc778 \uc74c\uc808 \ub2e8\uc704\ub294 \uc5b4\ub5a8\uae4c\uc694?
[‘\uc548’, ‘\ub155’, ‘\ud558’, ‘\uc138’, ‘\uc694’] \ub97c \uad00\ucc30\ud574\ubcfc \ub54c, \uc624\ud788\ub824 \uc6d0\ubb38\uc758 \uc758\ubbf8\uac00 \ud30c\uad34\ub41c \uac83\uc744 \ubcfc \uc218 \uc788\uc2b5\ub2c8\ub2e4.
\uc790\uc18c \ub2e8\uc704\ub85c \ucabc\uac1c\ub294 \uacbd\uc6b0\uc778 [‘\u3147’, ‘\u314f’, ‘\u3134’, ‘\u3134’ … ‘\u3147’, ‘\u315b’] \ub3c4 \ube44\uc2b7\ud558\uc8e0.
\ubb3c\ub860, \uc774\ub807\uac8c \uc790\uc18c \ub2e8\uc704\uc758 \ud1a0\ud06c\ub098\uc774\uc9d5\uc774 \uc624\ud0c8\uc790 \uc784\ubca0\ub529\uc5d0\uc11c \uc88b\uc740 \uc131\ub2a5\uc744 \ub098\ud0c0\ub0b8\ub2e4\ub294 \uc5f0\uad6c \uacb0\uacfc\ub3c4 \uc788\uc2b5\ub2c8\ub2e4 (\ub9c1\ud06c<\/a>)
\uadf8\ub798\uc11c \ubaa9\uc801\uc5d0 \ub530\ub77c \ud1a0\ud06c\ub098\uc774\uc9d5\uc758 \ub2e8\uc704\ub97c \uc798 \uace0\ub974\ub294 \uac83\ub3c4 \uc544\uc8fc \uc911\uc694\ud558\uc8e0.<\/p>\n\n\n\n<\/figure>\n\n\n\n
(1) \uc790\uc18c, (2) \uc74c\uc808, (3) \ud615\ud0dc\uc18c, (4) subword, (5) \ud615\ud0dc\uc18c\ubd84\uc11d + subword\ub85c \uc0dd\uc131\ud55c vocab\uc744 \uc774\uc6a9\ud574 Subword \uae30\ubc18 \ud1a0\ud06c\ub098\uc774\uc9d5, (6) \ub744\uc5b4\uc4f0\uae30 \uae30\ubc18
\uc73c\ub85c \uc2e4\ud5d8\uc744 \uc9c4\ud589\ud558\uc600\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<\/figure>\n\n\n\n
<\/p>\n\n\n\n
\ucd5c\uadfc\uc5d0\ub294 morpheme-aware BBPE (byte-level byte pair encoding)\uac00 \uc801\uc6a9\ub418\ub294 \uc5f0\uad6c\ub3c4 \uc788\ub354\ub77c\uace0\uc694!<\/p>\n\n\n\n