# Use Swama from 4D
Swama is a high-performance MLX-based LLM inference engine designed specifically for macOS.
Instantiate cs.swama.swama in your On Startup database method:
// Start a local Swama (MLX) inference server.
// Call cs.swama.swama.new() with no arguments for the defaults,
// or pass a port, model list, cache folder, options and event callbacks.
var $swama : cs.swama.swama

If (False)
	$swama:=cs.swama.swama.new()  // default configuration
Else 
	// Folder under which downloaded models are cached
	var $homeFolder : 4D.Folder
	$homeFolder:=Folder(fk home folder).folder(".MLX")
	
	var $file : 4D.File
	var $URL : Text
	var $path : Text
	var $port : Integer
	var $options : Object
	var $folder : 4D.Folder
	var $chat : cs.event.huggingface
	
	var $event : cs.event.event
	$event:=cs.event.event.new()
	
	/*
	Callback signatures:
	Function onError($params : Object; $error : cs.event.error)
	Function onSuccess($params : Object; $models : cs.event.models)
	Function onData($request : 4D.HTTPRequest; $event : Object)
	Function onResponse($request : 4D.HTTPRequest; $event : Object)
	Function onTerminate($worker : 4D.SystemWorker; $params : Object)
	*/
	
	$event.onError:=Formula(ALERT($2.message))
	$event.onSuccess:=Formula(ALERT($2.models.extract("name").join(",")+" loaded!"))
	
	// Download progress as a percentage. Assign exactly one formula per callback —
	// a second assignment overwrites the first. Alternative to MESSAGE:
	// $event.onData:=Formula(LOG EVENT(Into 4D debug message; This.file.fullName+":"+String((This.range.end/This.range.length)*100; "###.00%")))
	$event.onData:=Formula(MESSAGE(This.file.fullName+":"+String((This.range.end/This.range.length)*100; "###.00%")))
	
	// Download finished. Alternative:
	// $event.onResponse:=Formula(LOG EVENT(Into 4D debug message; This.file.fullName+":download complete"))
	$event.onResponse:=Formula(MESSAGE(This.file.fullName+":download complete"))
	
	$event.onTerminate:=Formula(LOG EVENT(Into 4D debug message; (["process"; $1.pid; "terminated!"].join(" "))))
	
	$port:=8080  // matches the curl and AI Kit examples, which target http://localhost:8080
	$options:={host: "127.0.0.1"}
	
	var $huggingfaces : cs.event.huggingfaces
	$folder:=$homeFolder.folder("Qwen3-4B-Thinking-2507")  // local cache folder for this model
	$path:="keisuke-miyako/Qwen3-4B-Thinking-2507-mlx-4bit"  // HuggingFace repository id
	$URL:="keisuke-miyako/Qwen3-4B-Thinking-2507-mlx-4bit"
	$chat:=cs.event.huggingface.new($folder; $URL; $path)
	$huggingfaces:=cs.event.huggingfaces.new([$chat])
	
	$swama:=cs.swama.swama.new($port; $huggingfaces; $homeFolder; $options; $event)
End if
Now you can test the server:
# Request embeddings from the local Swama server (OpenAI-compatible /v1/embeddings endpoint)
curl -X POST http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"input": ["Hello world", "Text embeddings"],
"model": "mlx-community/Qwen3-Embedding-0.6B-8bit"
}'
# Streamed chat completion from the local Swama server (OpenAI-compatible /v1/chat/completions endpoint)
# NOTE(review): "gemma3" must already be available on the server — the setup example above
# downloads Qwen3-4B-Thinking-2507; confirm the model name before running
curl -X POST http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gemma3",
"messages": [
{"role": "user", "content": "Hello!"}
],
"temperature": 0.7,
"max_tokens": 100,
"stream": true
}'
Or, use AI Kit:
// Query the local Swama server through 4D's AI Kit component.
var $client : cs.AIKit.OpenAI
var $sample : Text
var $result : cs.AIKit.OpenAIEmbeddingsResult

$client:=cs.AIKit.OpenAI.new()
$client.baseURL:="http://127.0.0.1:8080/v1"  // point AI Kit at the local server instead of api.openai.com

$sample:="The quick brown fox jumps over the lazy dog."
$result:=$client.embeddings.create($sample)
Finally to terminate the server:
// Shut down the Swama server.
// NOTE(review): presumably new() with no arguments attaches to the already-running
// instance rather than launching a second server — confirm against cs.swama.swama
var $swama : cs.swama.swama
$swama:=cs.swama.swama.new()
$swama.terminate()
The API is compatible with OpenAI.
| Class | API | Availability |
|---|---|---|
| Models | /v1/models | ✅ |
| Chat | /v1/chat/completions | ✅ |
| Images | /v1/images/generations | |
| Moderations | /v1/moderations | |
| Embeddings | /v1/embeddings | ✅ |
| Files | /v1/files |