Getting Started with C++ API Development Using the Open-Source Large-Model Framework llama.cpp


llama.cpp is a lightweight, open-source framework for AIGC-style large language models, written in C++. It can run large models locally on ordinary consumer-grade devices, and it can also be embedded into applications as a dependency library to provide GPT-like functionality.

The following demo, built against the llama.cpp source code, uses the C++ API to load a local model file and perform GPT-style text generation.

Project Structure

llamacpp_starter
  - llama.cpp-b1547
  - src
    |- main.cpp
  - CMakeLists.txt

CMakeLists.txt

cmake_minimum_required(VERSION 3.15)

# note: this starter project is set up for unix-like systems only; building on Windows is not supported here yet
project(llamacpp_starter)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_subdirectory(llama.cpp-b1547)

include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547/common
)

file(GLOB SRC
    src/*.h
    src/*.cpp
)

add_executable(${PROJECT_NAME} ${SRC})

target_link_libraries(${PROJECT_NAME}
    common
    llama
)

main.cpp

#include <iostream>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

int main(int argc, char** argv)
{
    bool numa_support = false;
    const std::string model_file_path = "./llama-ggml.gguf";
    const std::string prompt = "once upon a time"; // input words
    const int n_len = 32; // total length of the sequence including the prompt

    // set gpt params
    gpt_params params;
    params.model = model_file_path;
    params.prompt = prompt;

    // init LLM
    llama_backend_init(numa_support);

    // load model
    llama_model_params model_params = llama_model_default_params();
    //model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model* model = llama_load_model_from_file(model_file_path.c_str(), model_params);
    if (model == NULL)
    {
        std::cerr << __func__ << " load model file error" << std::endl;
        return 1;
    }

    // init context
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.seed = 1234;
    ctx_params.n_ctx = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context* ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL)
    {
        std::cerr << __func__ << " failed to create the llama_context" << std::endl;
        return 1;
    }

    // tokenize the prompt
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx)
    {
        std::cerr << __func__ << " error: n_kv_req > n_ctx, the required KV cache size is not big enough" << std::endl;
        std::cerr << __func__ << " either reduce n_parallel or increase n_ctx" << std::endl;
        return 1;
    }

    // print the prompt token-by-token
    for (auto id : tokens_list)
        std::cout << llama_token_to_piece(ctx, id) << " ";
    std::cout << std::endl;

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++)
        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0)
    {
        std::cerr << __func__ << " llama_decode failed" << std::endl;
        return 1;
    }

    // main loop to generate words
    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        auto n_vocab = llama_n_vocab(model);
        auto* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);

        for (llama_token token_id = 0; token_id < n_vocab; token_id++)
        {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // sample the most likely token
        const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream?
        if (new_token_id == llama_token_eos(model) || n_cur == n_len)
        {
            std::cout << std::endl;
            break;
        }

        std::cout << llama_token_to_piece(ctx, new_token_id) << " ";

        // prepare the next batch
        llama_batch_clear(batch);

        // push this new token for next evaluation
        llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

        n_decode += 1;
        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            std::cerr << __func__ << " failed to eval" << std::endl;
            return 1;
        }
    }

    std::cout << std::endl;

    const auto t_main_end = ggml_time_us();

    std::cout << __func__ << " decoded " << n_decode << " tokens in " << (t_main_end - t_main_start) / 1000000.0f << " s, speed: " << n_decode / ((t_main_end - t_main_start) / 1000000.0f) << " t / s" << std::endl;

    llama_print_timings(ctx);

    llama_batch_free(batch);

    // free context
    llama_free(ctx);
    llama_free_model(model);

    // free LLM
    llama_backend_free();

    return 0;
}
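
The demo above always picks the single most likely token (llama_sample_token_greedy), so the output is deterministic. Below is a minimal sketch of how the sampling step could be swapped for temperature plus top-k/top-p sampling, assuming the sampling helpers exposed by this llama.cpp revision (llama_sample_top_k, llama_sample_top_p, llama_sample_temp, llama_sample_token); the helper name sample_token_topk_topp and the parameter values are illustrative, not part of llama.cpp itself.

#include <vector>
#include "llama.h"

// Sketch: build the candidate list from the logits of the last decoded token,
// filter it with top-k and top-p, apply a temperature, then draw a token from
// the remaining distribution instead of taking the argmax.
static llama_token sample_token_topk_topp(llama_context* ctx, llama_model* model, llama_batch& batch,
                                          int top_k, float top_p, float temp)
{
    const int n_vocab = llama_n_vocab(model);
    float* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++)
        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(ctx, &candidates_p, top_k, 1);  // keep only the k most likely tokens
    llama_sample_top_p(ctx, &candidates_p, top_p, 1);  // keep the smallest set with cumulative probability >= top_p
    llama_sample_temp(ctx, &candidates_p, temp);       // rescale the remaining logits by the temperature

    return llama_sample_token(ctx, &candidates_p);     // draw randomly from what is left
}

In the generation loop, the greedy call would then be replaced by something like const llama_token new_token_id = sample_token_topk_topp(ctx, model, batch, 40, 0.9f, 0.8f); where top_k = 40, top_p = 0.9 and temp = 0.8 are common illustrative defaults rather than values prescribed by llama.cpp.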

Notes:

  • The model files used by llama.cpp have to be downloaded separately; it is recommended to download already-converted GGUF-format files from the Hugging Face website
  • The llama.cpp build can be configured with many kinds of enhancement options, such as CPU/GPU acceleration and compute-acceleration libraries; when a GPU backend is enabled, layers can also be offloaded from the C++ side, as shown in the sketch after this list
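
If llama.cpp was compiled with a GPU backend (for example the cuBLAS or Metal build options of this revision), the C++ side can offload model layers to the GPU through the model parameters before loading, mirroring the commented-out line in main.cpp. A minimal sketch, with a hypothetical helper name and an illustrative layer count:

#include "llama.h"

// Sketch: load a model with part of it offloaded to the GPU. n_gpu_layers only
// takes effect when the linked llama.cpp was built with a GPU backend; with a
// CPU-only build the setting is simply ignored and inference stays on the CPU.
static llama_model* load_model_with_gpu_offload(const char* path, int n_gpu_layers)
{
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = n_gpu_layers; // e.g. 99 offloads all layers, as in the commented line in main.cpp
    return llama_load_model_from_file(path, model_params);
}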

Source Code

llamacpp_starter

