{"id":64292,"date":"2023-11-20T11:07:11","date_gmt":"2023-11-20T02:07:11","guid":{"rendered":"https:\/\/smilegate.ai\/?p=64292"},"modified":"2023-11-20T11:07:14","modified_gmt":"2023-11-20T02:07:14","slug":"llm%ec%9d%84-%eb%b9%a0%eb%a5%b4%ea%b2%8c-%ec%84%9c%eb%b9%99%ed%95%98%eb%8a%94-%eb%b0%a9%eb%b2%95","status":"publish","type":"post","link":"https:\/\/smilegate.ai\/cn\/2023\/11\/20\/llm%ec%9d%84-%eb%b9%a0%eb%a5%b4%ea%b2%8c-%ec%84%9c%eb%b9%99%ed%95%98%eb%8a%94-%eb%b0%a9%eb%b2%95\/","title":{"rendered":"LLM\uc744 \ube60\ub974\uac8c \uc11c\ube59\ud558\ub294 \ubc29\ubc95"},"content":{"rendered":"
[\ubd84\uc11dAI\uc11c\ube44\uc2a4\ud300 \ubc15\ud6a8\uc8fc]<\/p>\n\n\n\n
\ucd5c\uadfc \uc5b8\uc5b4 \ubaa8\ub378\uc758 \ubc1c\uc804\uc740 \ud070 \uc784\ud329\ud2b8\ub97c \uc8fc\uc5c8\uace0, \uadf8\ub9cc\ud07c \uc5f0\uad6c \ub610\ud55c \ud65c\ubc1c\ud788 \uc774\ub8e8\uc5b4\uc9c0\uace0 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n
\uadf8\ub7ec\ub098 \uc774\ub7ec\ud55c \ubaa8\ub378\uc758 \uc11c\ube59 \uacfc\uc815\uc5d0\uc11c \ubc1c\uc0dd\ud558\ub294 \uace0\ub3c4\uc758 \uacc4\uc0b0 \uc791\uc5c5\uacfc \uba54\ubaa8\ub9ac \uc694\uad6c\ub7c9\uc740 \uc0c8\ub85c\uc6b4 \ub3c4\uc804 \uacfc\uc81c\ub97c \uc81c\uc2dc\ud558\uace0 \uc788\uc2b5\ub2c8\ub2e4. \ucd5c\uc801\ud654\ub41c \uc11c\ube59\uc740 \uc774\ub7ec\ud55c LLM\uc744 \ud6a8\uc728\uc801\uc73c\ub85c \uc6b4\uc601\ud558\uae30 \uc704\ud55c \uc911\uc694\ud55c \uc5ed\ud560\uc744 \uc218\ud589\ud569\ub2c8\ub2e4. \ubcf8\ubb38\uc5d0\uc11c\ub294 \uc774\ub7ec\ud55c \uc911\uc694\ud55c \uc5ed\ud560\uc744 \ud558\ub294 LLM \ucd5c\uc801\ud654 \ub77c\uc774\ube0c\ub7ec\ub9ac 3\uac1c\ub97c \uc18c\uac1c\ud569\ub2c8\ub2e4.<\/p>\n\n\n\n
FasterTransformer <\/strong>(GitHub<\/a>)<\/p>\n\n\n\n \uac00\uc7a5 \uc720\uba85\ud55c \ubaa8\ub378 \ucd5c\uc801\ud654 \ub77c\uc774\ube0c\ub7ec\ub9ac\ub85c LLM\uc774 \ub098\uc624\uae30 \uc804\ubd80\ud130 \uc0ac\uc6a9\ub418\ub2e4 LLM\uc774 \ub4f1\uc7a5\ud558\uba74\uc11c \ub354 \ub9ce\uc774 \uc0ac\uc6a9\ub418\uace0 \uc788\uc2b5\ub2c8\ub2e4. \ud070 \ud2b9\uc9d5\uc740 C++, CUDA Native\ub85c \ub3d9\uc791\ud558\uace0 Model Parallel(Tensor Parallel\/Pipeline Parallel) \uae30\ubc18\uc758 \ucd94\ub860 \uac00\uc18d \uc5d4\uc9c4\uc744 \uad6c\ud604\ud588\ub2e4\ub294 \uac83\uc785\ub2c8\ub2e4.<\/p>\n\n\n\n \ub2e4\ub9cc 2023\ub144 \ucd08\uae4c\uc9c0 \ubc84\uc804\uc744 Release\ud558\ub2e4 \ud604\uc7ac\ub294 \uac1c\ubc1c\uc774 \uc911\ub2e8\ub418\uc5c8\uace0, \uc774\ud6c4 \uac1c\ubc1c \uac74\uc774 TensorRT-LLM\uc73c\ub85c \uc774\uad00\ub418\uc5c8\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n FasterTransformer\uc5d0\uc11c \uc9c0\uc6d0\ub418\ub294 \ubaa8\ub378\uc758 \uc885\ub958\ub294 \uc5ec\uae30<\/a>\uc11c \ud655\uc778\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n vLLM<\/strong> (GitHub<\/a>)<\/p>\n\n\n\n 2023\ub144 6\uc6d4\uc5d0 \ub4f1\uc7a5\ud574\uc11c \ud604\uc7ac \uc778\uae30 \uc788\ub294 \ubaa8\ub378 \ucd5c\uc801\ud654 \ub77c\uc774\ube0c\ub7ec\ub9ac \uc911 \ud558\ub098\uc785\ub2c8\ub2e4. \ucc98\uc74c \ub4f1\uc7a5\ud588\uc744 \ub54c \uc9c0\uc6d0\ub418\ub294 \ubaa8\ub378\uc774 \ub9ce\uc9c0 \uc54a\uc558\ub358 FasterTransformer\uc5d0 \ube44\ud574 \ub2e4\uc218\uc758 \ucd5c\uc2e0 \ubaa8\ub378\uc744 \uc9c0\uc6d0\ud558\uba74\uc11c \ucd5c\uc2e0 \uc5b8\uc5b4 \ubaa8\ub378\ub4e4\uc744 \ud6a8\uc728\uc801\uc73c\ub85c \ub2e4\ub8f0 \uc218 \uc788\uac8c \ub418\uc5c8\uc2b5\ub2c8\ub2e4. 
\ud070 \ud2b9\uc9d5\uc740 Paged Attention \uae30\ubc95\uc744 \ud65c\uc6a9\ud574\uc11c latency\ub97c \uc720\uc9c0\ud558\uba74\uc11c throughput\uc740 2-4X \ud5a5\uc0c1\ud588\ub2e4\ub294 \uac83\uc785\ub2c8\ub2e4.<\/p>\n\n\n\n vLLM\uc5d0\uc11c \uc9c0\uc6d0\ub418\ub294 \ubaa8\ub378\uc758 \uc885\ub958\ub294 \uc5ec\uae30<\/a>\uc11c \ud655\uc778\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n TensorRT-LLM<\/strong> (GitHub<\/a>)<\/p>\n\n\n\n \uac00\uc7a5 \ucd5c\uadfc\uc5d0 \uacf5\uac1c(2023\ub144 10\uc6d4)\ud55c \ub77c\uc774\ube0c\ub7ec\ub9ac\ub85c, FasterTransformer\uc758 \ud6c4\uc18d \ubc84\uc804\uc785\ub2c8\ub2e4. \uc55e\uc5d0\uc11c \uc124\uba85\ub4dc\ub9b0 \uac83\ucc98\ub7fc FasterTransformer\uc758 \uac1c\ubc1c\uc774 \uc911\ub2e8\ub418\uace0 \uc774\ud6c4 \uae30\ub2a5\uc774 \ubaa8\ub450 TensorRT-LLM\uc5d0 \uc774\uad00 \uac1c\ubc1c\ub418\uc5c8\uc2b5\ub2c8\ub2e4. \ud070 \ud2b9\uc9d5\uc740 TensorRT \ub525\ub7ec\ub2dd \ucef4\ud37c\uc77c\ub7ec\uc640 \ucd5c\uc801\ud654\ub41c \ucee4\ub110, \uc804\/\ud6c4\ucc98\ub9ac \ub2e8\uacc4, Multi-GPU\/Multi-Node \ud1b5\uc2e0 \uae30\ubcf8\uc694\uc18c\uc5d0 In-flight Batching\uc774\ub77c \ubd88\ub9ac\ub294 \ucd5c\uc801\ud654\ub41c \uc2a4\ucf00\uc904\ub9c1 \uae30\uc220\uc774 \ud3ec\ud568\ub410\ub2e4\ub294 \uac83\uc785\ub2c8\ub2e4.<\/p>\n\n\n\n