Step-by-Step Tutorial: Deploy a Small Gemma Model with Transformers and Build Your Own AI Function-Calling Service
Version: v0.0.1
Author: based on a real project
Difficulty: beginner-friendly
1. Why Bother With This?
1.1 The problem
You want an AI model to do function calling (ask about the weather, look up data, and so on), but:
- Ollama's model selection is limited
- You want access to the huge catalog of models on HuggingFace
- You don't want to spend serious money on API calls
1.2 The solution
Build your own service with Transformers + FastAPI:
- ✅ Works with virtually any text-generation model on the HuggingFace Hub
- ✅ Free to test locally
- ✅ Deploys to the cloud just as well
- ✅ OpenAI-compatible API, so it's easy to integrate
1.3 Why Gemma-270M?
- Small enough: ~1 GB, so it runs on free-tier hardware
- Capable enough: fine-tuned specifically for function calling
- Fast enough: acceptable response times
2. Setup (5 Minutes)
2.1 Install Python
# Python 3.9+ recommended
python --version
2.2 Install the dependencies
pip install fastapi uvicorn[standard] transformers torch accelerate python-dotenv python-multipart huggingface_hub
2.3 Project layout
my_gemma_service/
├── app.py                   # main program
├── utils/
│   ├── chat_request.py      # request validation
│   ├── chat_response.py     # response generation
│   └── model.py             # model management
├── .env                     # configuration file
└── requirements.txt         # dependency list
3. Implementation (Follow Along)
3.1 Create the .env file
# file: .env
DEFAULT_MODEL_NAME="unsloth/functiongemma-270m-it"
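If you plan to pull gated or private models later, you can also add a HUGGINGFACE_TOKEN=... line to the same file; utils/model.py below reads it with os.getenv. As an optional sanity check that python-dotenv picks up your config, you can run a throwaway script like this (the file name is just a suggestion):

# check_env.py -- optional: confirm the .env values are loaded
import os
from dotenv import load_dotenv

load_dotenv()
print(os.getenv("DEFAULT_MODEL_NAME"))   # expect: unsloth/functiongemma-270m-it
print(os.getenv("HUGGINGFACE_TOKEN"))    # None unless you added a token line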
3.2 utils/model.py - Model Management
import os
from pathlib import Path
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from pydantic import BaseModel

class DownloadRequest(BaseModel):
    model: str

def check_model(model_name):
    """Check whether the model already exists in the local cache."""
    cache_dir = "./my_model_cache"
    model_path = Path(cache_dir) / f"models--{model_name.replace('/', '--')}"
    snapshot_path = model_path / "snapshots"
    if snapshot_path.exists() and any(snapshot_path.iterdir()):
        print(f"✅ Model {model_name} already exists")
        return model_name, cache_dir, True
    print(f"❌ Model {model_name} not found")
    return model_name, cache_dir, False

def download_model(model_name):
    """Download the model into the local cache."""
    cache_dir = "./my_model_cache"
    print(f"📥 Downloading: {model_name}")
    # Log in if a token is provided (needed for gated/private models)
    token = os.getenv("HUGGINGFACE_TOKEN")
    if token:
        login(token=token)
    try:
        # Loading once is enough to populate the cache
        AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
        print("✅ Download complete!")
        return True, f"Model {model_name} downloaded successfully"
    except Exception as e:
        return False, f"Download failed: {str(e)}"

def initialize_pipeline(model_name):
    """Initialize the text-generation pipeline."""
    model_name, cache_dir, success = check_model(model_name)
    if not success:
        return None, None, False
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        # Point the pipeline at the same cache dir so it reuses the downloaded weights
        pipe = pipeline(
            "text-generation",
            model=model_name,
            tokenizer=tokenizer,
            model_kwargs={"cache_dir": cache_dir},
        )
        print("✅ Pipeline initialized!")
        return pipe, tokenizer, True
    except Exception as e:
        print(f"❌ Initialization failed: {e}")
        return None, None, False
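If you want to warm the cache before ever starting the API, these helpers can also be called from a one-off script. A minimal sketch, run from the project root (the file name is hypothetical):

# warm_cache.py -- optional one-off script using the helpers above
from utils.model import check_model, download_model, initialize_pipeline

name = "unsloth/functiongemma-270m-it"
_, _, exists = check_model(name)
if not exists:
    ok, msg = download_model(name)
    print(msg)
pipe, tokenizer, ready = initialize_pipeline(name)
print("pipeline ready:", ready)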
3.3 utils/chat_request.py - Request Validation
from pydantic import BaseModel
from typing import List, Optional, Dict, Any

class ChatRequest(BaseModel):
    model: Optional[str] = "unsloth/functiongemma-270m-it"
    messages: List[Dict[str, Any]]
    temperature: Optional[float] = 1.0
    max_tokens: Optional[int] = None
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
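Because ChatRequest is a Pydantic model, every field except messages falls back to a default, which is why the request payloads in section 4 can stay so short. A quick sanity check from the project root:

# Quick check of the request schema (any Python shell with pydantic installed)
from utils.chat_request import ChatRequest

req = ChatRequest(messages=[{"role": "user", "content": "What is the weather in Beijing?"}])
print(req.model)        # unsloth/functiongemma-270m-it (the default)
print(req.max_tokens)   # None -> create_chat_response later falls back to 500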
3.4 utils/chat_response.py - Response Generation
from pydantic import BaseModel
from typing import List, Dict, Any
import time
import re

class ChatChoice(BaseModel):
    index: int
    message: Dict[str, str]
    finish_reason: str

class ChatUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class ChatResponse(BaseModel):
    id: str
    object: str
    created: int
    model: str
    choices: List[ChatChoice]
    usage: ChatUsage

def convert_json_format(input_data):
    """Convert the raw pipeline output into a simpler generations structure."""
    output_generations = []
    for item in input_data:
        generated_text_list = item.get('generated_text', [])
        assistant_content = ""
        for message in generated_text_list:
            if message.get('role') == 'assistant':
                assistant_content = message.get('content', '')
                break
        # Strip any <think>...</think> reasoning block from the reply
        clean_content = re.sub(r'<think>.*?</think>\s*', '', assistant_content, flags=re.DOTALL).strip()
        output_generations.append([{"text": clean_content, "generationInfo": {"finish_reason": "stop"}}])
    return {"generations": output_generations}
def create_chat_response(request, pipe, tokenizer):
    """Build an OpenAI-style chat completion response."""
    if pipe is None:
        return ChatResponse(
            id=f"chatcmpl-{int(time.time())}",
            object="chat.completion",
            created=int(time.time()),
            model=request.model,
            choices=[ChatChoice(index=0, message={"role": "assistant", "content": "Model is still initializing..."}, finish_reason="stop")],
            usage=ChatUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0)
        )
    max_new_tokens = request.max_tokens if request.max_tokens is not None else 500
    result = pipe(request.messages, max_new_tokens=max_new_tokens)
    converted_result = convert_json_format(result)
    completion_text = converted_result["generations"][0][0]["text"]
    prompt_tokens = sum(len(tokenizer.encode(msg.get("content", ""))) for msg in request.messages)
    completion_tokens = len(tokenizer.encode(completion_text))
    return ChatResponse(
        id=f"chatcmpl-{int(time.time())}",
        object="chat.completion",
        created=int(time.time()),
        model=request.model,
        choices=[ChatChoice(index=0, message={"role": "assistant", "content": completion_text}, finish_reason="stop")],
        usage=ChatUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=prompt_tokens + completion_tokens)
    )
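To make the data flow concrete: when the text-generation pipeline is given a list of chat messages, it returns (roughly) a list of dicts whose generated_text holds the whole conversation, and convert_json_format pulls out the assistant turn and strips any <think>...</think> block. An illustrative, hand-written example:

# Illustration only -- a hand-crafted stand-in for what pipe(...) returns
from utils.chat_response import convert_json_format

sample = [{
    "generated_text": [
        {"role": "user", "content": "What is the weather in Beijing?"},
        {"role": "assistant", "content": "<think>call the tool</think>get_weather(city=\"Beijing\")"},
    ]
}]
print(convert_json_format(sample))
# {'generations': [[{'text': 'get_weather(city="Beijing")', 'generationInfo': {'finish_reason': 'stop'}}]]}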
3.5 app.py - The Main Program
from fastapi import FastAPI, HTTPException
import os
from dotenv import load_dotenv
from utils.chat_request import ChatRequest
from utils.chat_response import create_chat_response, ChatResponse
from utils.model import check_model, initialize_pipeline, download_model, DownloadRequest

# Global state
model_name = None
pipe = None
tokenizer = None

app = FastAPI(title="Gemma Function-Calling Service", version="1.0.0")

@app.on_event("startup")
async def startup_event():
    """Load the default model on startup."""
    global pipe, tokenizer, model_name
    load_dotenv()
    default_model = os.getenv("DEFAULT_MODEL_NAME", "unsloth/functiongemma-270m-it")
    print(f"🚀 Loading: {default_model}")
    try:
        pipe, tokenizer, success = initialize_pipeline(default_model)
        if success:
            model_name = default_model
            print("✅ Model loaded!")
        else:
            print("⚠️ The model needs to be downloaded first")
    except Exception as e:
        print(f"❌ Startup failed: {e}")

@app.get("/")
async def read_root():
    return {
        "message": "Gemma service is up!",
        "current_model": model_name,
        "status": "ready" if pipe else "waiting"
    }

@app.post("/download")
async def download_model_endpoint(request: DownloadRequest):
    """Download a model, then load it."""
    global pipe, tokenizer, model_name
    success, message = download_model(request.model)
    if success:
        pipe, tokenizer, init_success = initialize_pipeline(request.model)
        if init_success:
            model_name = request.model
            return {"status": "success", "message": message, "loaded": True, "current_model": model_name}
        else:
            return {"status": "success", "message": message, "loaded": False, "error": "Initialization failed"}
    else:
        raise HTTPException(status_code=500, detail=message)

@app.post("/v1/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest):
    """OpenAI-compatible chat endpoint."""
    global pipe, tokenizer, model_name
    if request.model != model_name:
        # Switch models on the fly if the request asks for a different one
        pipe, tokenizer, success = initialize_pipeline(request.model)
        if not success:
            raise HTTPException(status_code=500, detail="Model initialization failed")
        model_name = request.model
    try:
        return create_chat_response(request, pipe, tokenizer)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
4. Run and Test (Watch the Magic)
4.1 Start the service
uvicorn app:app --host 0.0.0.0 --port 7860 --reload
When you see output like this, it worked:
🚀 Loading: unsloth/functiongemma-270m-it
✅ Model unsloth/functiongemma-270m-it already exists
✅ Pipeline initialized!
✅ Model loaded!
INFO:     Uvicorn running on http://0.0.0.0:7860
4.2 Test function calling
Option 1: in the browser
Open http://localhost:7860/docs and try the endpoints straight from the Swagger UI.
Option 2: with curl
curl -X POST "http://localhost:7860/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is the weather in Beijing?"},
      {"role": "system", "content": "You can call the get_weather(city) function"}
    ],
    "max_tokens": 100
  }'
Option 3: with Python
import requests

response = requests.post("http://localhost:7860/v1/chat/completions", json={
    "messages": [
        {"role": "user", "content": "What is the weather in Beijing?"},
        {"role": "system", "content": "You can call the get_weather(city) function"}
    ],
    "max_tokens": 100
})
print(response.json())
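A successful call returns the OpenAI-style payload defined in chat_response.py. The values below are illustrative (the actual content depends on the model), but the shape is fixed by the ChatResponse schema:

# Roughly what print(response.json()) shows -- values are made up for illustration
{
    "id": "chatcmpl-1700000000",
    "object": "chat.completion",
    "created": 1700000000,
    "model": "unsloth/functiongemma-270m-it",
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "get_weather(city=\"Beijing\")"},
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 21, "completion_tokens": 9, "total_tokens": 30},
}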
4.3 What if the model is not downloaded yet?
Download it first:
curl -X POST "http://localhost:7860/download" \
  -H "Content-Type: application/json" \
  -d '{"model": "unsloth/functiongemma-270m-it"}'
5. Deploy to the Cloud (HuggingFace Space)
5.1 Write the Dockerfile
FROM python:3.9-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
5.2 Write requirements.txt
fastapi
uvicorn[standard]
transformers
torch
accelerate
python-dotenv
python-multipart
huggingface_hub
5.3 Push to a HuggingFace Space
- Create a Space: HuggingFace → Spaces → New → Docker
- Upload the code:
git init
git add .
git commit -m "v0.0.1"
git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
git push -u origin main
- Wait for the build: 5-10 minutes
5.4 Is the free tier enough?
HuggingFace Space free tier:
- CPU: 2 cores
- RAM: 16 GB
- Storage: 10 GB
Gemma-270M needs:
- Model size: ~1 GB
- Runtime memory: ~3-4 GB
- ✅ Plenty of headroom!
6. FAQ
Q1: Model downloads are slow?
# Use a mirror (mainly helpful in mainland China)
export HF_ENDPOINT=https://hf-mirror.com
Q2: Not enough memory?
- Switch to an even smaller model
- Use a quantized version (see the sketch below)
- Add swap space
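For the quantization route, one common option with Transformers is 4-bit loading via bitsandbytes. A minimal sketch, assuming a CUDA GPU and pip install bitsandbytes (on CPU-only machines, switching to a smaller model is usually the better fix):

# Hypothetical 4-bit loading sketch -- not part of the service code above
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_name = "unsloth/functiongemma-270m-it"
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="./my_model_cache")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="./my_model_cache",
    quantization_config=bnb_config,   # cuts weight memory roughly 4x
    device_map="auto",
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)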
Q3: Why not just use Ollama?
Ollama is great, but:
- Its model library is limited
- Transformers can load virtually any model on the HuggingFace Hub
- Deployment is more flexible
Q4: How do I switch to another model?
Edit .env:
DEFAULT_MODEL_NAME="another-model-name"
Then restart the service.
7. What's Next?
Now that you have a working function-calling service, you can:
- Try more models: there are thousands to choose from on HuggingFace
- Add more functions: weather, databases, API calls, and so on (see the sketch below)
- Integrate it into your apps: web, mobile, mini-programs, anything that speaks HTTP
Core advantages: simple, flexible, free. Validate the idea quickly, then decide whether it is worth paying to scale up.
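For the "add more functions" item, the client side can stay very simple: describe the available functions in the system prompt, then parse the model's reply and dispatch to real code. A hypothetical sketch; the plain-text get_weather(...) reply format is an assumption, so adapt the parsing to whatever your chosen model actually emits:

# Hypothetical client-side dispatcher -- adjust the parsing to your model's output
import re
import requests

def get_weather(city: str) -> str:
    return f"{city}: sunny, 25°C"   # stand-in for a real weather API

resp = requests.post("http://localhost:7860/v1/chat/completions", json={
    "messages": [
        {"role": "user", "content": "What is the weather in Beijing?"},
        {"role": "system", "content": "You can call the get_weather(city) function"}
    ],
    "max_tokens": 100
})
reply = resp.json()["choices"][0]["message"]["content"]

match = re.search(r'get_weather\(\s*(?:city\s*=\s*)?["\']?([^"\')\s]+)', reply)
if match:
    print(get_weather(match.group(1)))   # run the real function with the parsed argument
else:
    print(reply)                         # no recognizable call -> show the raw reply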
8. Project File Checklist
my_gemma_service/
├── .env                     # sets the model name
├── app.py                   # main program (~50 lines)
├── utils/
│   ├── chat_request.py      # request validation (~10 lines)
│   ├── chat_response.py     # response generation (~50 lines)
│   └── model.py             # model management (~60 lines)
├── requirements.txt         # dependencies
├── Dockerfile               # for deployment
└── my_model_cache/          # model cache (created automatically)
Total code: roughly 170 lines
Version: v0.0.1
Status: ✅ runnable
Last updated: 2026-01-01
Questions? Just ask! 🚀