llama.cpp是一个用C++编写的轻量级开源类AIGC大模型框架,支持在消费级普通设备上本地部署并运行大模型,也可以作为依赖库集成到应用程序中,提供类GPT的功能。
以下基于llama.cpp的源码,利用其C++ API开发一个示例demo,演示如何加载本地模型文件并提供GPT文本生成功能。
项目结构
llamacpp_starter
  - llama.cpp-b1547
  - src
    |- main.cpp
  - CMakeLists.txt
CMakeLists.txt
cmake_minimum_required(VERSION 3.15)

# NOTE(review): the original comment here said the build "only works for unix"
# and referred to xapian; it appears to have been copied from another project.
# Confirm platform support for this project before relying on that statement.
project(llamacpp_starter)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Build the vendored llama.cpp sources; this subdirectory is expected to define
# the `common` and `llama` targets linked below.
add_subdirectory(llama.cpp-b1547)

# Collect the application sources.
file(GLOB SRC src/*.h src/*.cpp)
add_executable(${PROJECT_NAME} ${SRC})

# Header search paths for llama.h and common.h, scoped to this executable only.
target_include_directories(${PROJECT_NAME} PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547/common
)

target_link_libraries(${PROJECT_NAME} common llama)
main.cpp
#include <iostream>#include <string>#include <vector>#include "common.h"#include "llama.h"int main(int argc, char** argv){ bool numa_support = false; const std::string model_file_path = "./llama-ggml.gguf"; const std::string prompt = "once upon a time"; // input words const int n_len = 32; // total length of the sequence including the prompt // set gpt params gpt_params params; params.model = model_file_path; params.prompt = prompt; // init LLM llama_backend_init(false); // load model llama_model_params model_params = llama_model_default_params(); //model_params.n_gpu_layers = 99; // offload all layers to the GPU llama_model* model = llama_load_model_from_file(model_file_path.c_str(), model_params); if (model == NULL) { std::cerr << __func__ << " load model file error" << std::endl; return 1; } // init context llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; ctx_params.n_ctx = 2048; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; llama_context* ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { std::cerr << __func__ << " failed to create the llama_context" << std::endl; return 1; } // tokenize the prompt std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { std::cerr << __func__ << " error: n_kv_req > n_ctx, the required KV cache size is not big enough" << std::endl; std::cerr << __func__ << " either reduce n_parallel or increase n_ctx" << std::endl; return 1; } // print the prompt token-by-token for (auto id : tokens_list) std::cout << llama_token_to_piece(ctx, id) << " "; std::cout << std::endl; // create a llama_batch with size 512 // we use this object to submit token data for decoding llama_batch batch = llama_batch_init(512, 0, 1); // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); i++) llama_batch_add(batch, tokens_list[i], i, { 0 }, false); // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { std::cerr << __func__ << " llama_decode failed" << std::endl; return 1; } // main loop to generate words int n_cur = batch.n_tokens; int n_decode = 0; const auto t_main_start = ggml_time_us(); while (n_cur <= n_len) { // sample the next token auto n_vocab = llama_n_vocab(model); auto* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); std::vector<llama_token_data> candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; // sample the most likely token 
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? if (new_token_id == llama_token_eos(model) || n_cur == n_len) { std::cout << std::endl; break; } std::cout << llama_token_to_piece(ctx, new_token_id) << " "; // prepare the next batch llama_batch_clear(batch); // push this new token for next evaluation llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); n_decode += 1; n_cur += 1; // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { std::cerr << __func__ << " failed to eval" << std::endl; return 1; } } std::cout << std::endl; const auto t_main_end = ggml_time_us(); std::cout << __func__ << " decoded " << n_decode << " tokens in " << (t_main_end - t_main_start) / 1000000.0f << " s, speed: " << n_decode / ((t_main_end - t_main_start) / 1000000.0f) << " t / s" << std::endl; llama_print_timings(ctx); llama_batch_free(batch); // free context llama_free(ctx); llama_free_model(model); // free LLM llama_backend_free(); return 0;}
注:
- llama支持的模型文件需要自己去下载,推荐到huggingface官网下载转换好的gguf格式文件
- llama.cpp编译可以配置多种类型的增强选项,比如支持CPU/GPU加速,数据计算加速库
源码
llamacpp_starter
本文由博客一文多发平台 OpenWrite 发布!