If you use Cactus in your research, please cite it as follows:
@software{cactus,
title = {Cactus: AI Inference Engine for Phones & Wearables},
author = {Ndubuaku, Henry and Cactus Team},
url = {https://github.com/cactus-compute/cactus},
year = {2025}
}
Note: Scroll all the way up and click the shields links for resources!
#include"cactus_engine.h"cactus_model_t model = cactus_init(
"path/to/weight/folder",
"path to txt or dir of txts for auto-rag",
false
);
constchar* messages = R"([
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "My name is Henry Ndubuaku"}
])";
constchar* options = R"({
"max_tokens": 50,
"stop_sequences": ["<|im_end|>"]
})";
char response[4096];
int result = cactus_complete(
model, // model handle
messages, // JSON chat messages
response, // response buffersizeof(response), // buffer size
options, // generation optionsnullptr, // tools JSONnullptr, // streaming callbacknullptr, // user datanullptr, // pcm audio buffer0// pcm buffer size
);
{
    "success": true,              // generation succeeded
    "error": null,                // error details if failed
    "cloud_handoff": false,       // true if cloud model used
    "response": "Hi there!",
    "function_calls": [],         // parsed tool calls
    "segments": [],               // transcription segments (empty for chat)
    "confidence": 0.8193,         // model confidence
    "confidence_threshold": 0.7,  // resolved handoff threshold (model-dependent)
    "time_to_first_token_ms": 45.23,
    "total_time_ms": 163.67,
    "prefill_tps": 1621.89,
    "decode_tps": 168.42,
    "ram_usage_mb": 245.67,
    "prefill_tokens": 28,
    "decode_tokens": 50,
    "total_tokens": 78
}
// Example: build a small computation graph, feed it data, run it, and read
// the output back.
#include"cactus_graph.h"
CactusGraph graph;
// Declare two graph inputs: `a` with shape {2, 3} at FP16 precision and `b`
// with shape {3, 4} at INT8 precision.
auto a = graph.input({2, 3}, Precision::FP16);
auto b = graph.input({3, 4}, Precision::INT8);
// Chain three operations; each call returns a handle that the next one consumes.
// NOTE(review): the boolean third argument of matmul presumably controls
// transposition of an operand — confirm against cactus_graph.h.
auto x1 = graph.matmul(a, b, false);
auto x2 = graph.transpose(x1);
auto result = graph.matmul(b, x2, true);
// Host-side buffers: 6 floats for the 2x3 input and 12 floats for the 3x4 input.
float a_data[6] = {1.1f, 2.3f, 3.4f, 4.2f, 5.7f, 6.8f};
float b_data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
// Bind the buffers to the inputs, passing the same precision each input was
// declared with.
// NOTE(review): the buffers are float arrays while the precisions are
// FP16/INT8; whether set_input converts the data is not visible here — confirm.
graph.set_input(a, a_data, Precision::FP16);
graph.set_input(b, b_data, Precision::INT8);
// Run the graph, then fetch the output for the final matmul's handle.
graph.execute();
// get_output yields an untyped pointer; the element type and the lifetime of
// the pointed-to data are not shown here.
void* output_data = graph.get_output(result);
// NOTE(review): presumably discards the graph's nodes and buffers, which would
// invalidate output_data — confirm before reading output_data after this call.
graph.hard_reset();