Spaces:
Running
Running
czjun committed on
Commit ·
8d28a45
1
Parent(s): ac5d6e0
Update README and implement training and evaluation scripts for Chinese summarization model
Browse files- Updated README.md to include new training and evaluation instructions.
- Changed default model name in app.py to `fnlp/bart-base-chinese` and adjusted max source length.
- Added a new endpoint `/summarize-plain` in app.py for plain text summarization.
- Updated requirements.txt to include new dependencies: accelerate, rouge-score, and bert-score.
- Created data_utils.py for loading JSONL data and iterating through summarization examples.
- Implemented evaluate.py for model evaluation with ROUGE and BERTScore metrics.
- Developed train.py for fine-tuning the summarization model with specified parameters.
- Added error handling for missing dependencies in evaluation and training scripts.
- README.md +18 -1
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/data_utils.cpython-310.pyc +0 -0
- __pycache__/evaluate.cpython-310.pyc +0 -0
- __pycache__/train.cpython-310.pyc +0 -0
- app.py +32 -9
- data_utils.py +34 -0
- evaluate.py +151 -0
- requirements.txt +3 -0
- train.py +128 -0
README.md
CHANGED
|
@@ -12,4 +12,21 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
|
|
| 12 |
|
| 13 |
To force a specific transformer model in Spaces, set the `MODEL_NAME` environment variable, for example:
|
| 14 |
|
| 15 |
-
`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
To force a specific transformer model in Spaces, set the `MODEL_NAME` environment variable, for example:
|
| 14 |
|
| 15 |
+
`fnlp/bart-base-chinese`
|
| 16 |
+
|
| 17 |
+
## Training and evaluation
|
| 18 |
+
|
| 19 |
+
For local fine-tuning and metric collection:
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
python train.py --train-path data/train.jsonl --valid-path data/valid.jsonl --output-dir outputs/bart_cn
|
| 23 |
+
python evaluate.py --test-path data/test.jsonl --model-name outputs/bart_cn --output-csv metrics_report.csv
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
The evaluation script prints and exports:
|
| 27 |
+
|
| 28 |
+
- `ROUGE-L`
|
| 29 |
+
- `BERTScore`
|
| 30 |
+
- `QAFactEval` when an external QAFactEval environment is available
|
| 31 |
+
- length hit rate
|
| 32 |
+
- average latency
|
__pycache__/app.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
|
|
|
__pycache__/data_utils.cpython-310.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
__pycache__/evaluate.cpython-310.pyc
ADDED
|
Binary file (5.44 kB). View file
|
|
|
__pycache__/train.cpython-310.pyc
ADDED
|
Binary file (3.57 kB). View file
|
|
|
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import List, Optional
|
|
| 7 |
|
| 8 |
from fastapi import FastAPI
|
| 9 |
from fastapi.responses import HTMLResponse
|
|
|
|
| 10 |
from pydantic import BaseModel, Field
|
| 11 |
|
| 12 |
try:
|
|
@@ -30,8 +31,8 @@ class SummaryOutput:
|
|
| 30 |
|
| 31 |
|
| 32 |
class SummarizationConfig:
|
| 33 |
-
model_name: str = os.getenv("MODEL_NAME", "
|
| 34 |
-
max_source_length: int =
|
| 35 |
max_target_length: int = 160
|
| 36 |
num_beams: int = 4
|
| 37 |
no_repeat_ngram_size: int = 3
|
|
@@ -151,7 +152,7 @@ class HybridSummarizer:
|
|
| 151 |
prompt,
|
| 152 |
return_tensors="pt",
|
| 153 |
truncation=True,
|
| 154 |
-
max_length=
|
| 155 |
)
|
| 156 |
inputs.pop("token_type_ids", None)
|
| 157 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
@@ -209,6 +210,20 @@ def summarize(req: SummarizeRequest):
|
|
| 209 |
)
|
| 210 |
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
@app.get("/")
|
| 213 |
def root():
|
| 214 |
error_note = f"<p>最近一次生成错误:<code>{engine.load_error}</code></p>" if engine.load_error else ""
|
|
@@ -289,11 +304,18 @@ def root():
|
|
| 289 |
border-radius: 6px;
|
| 290 |
}
|
| 291 |
pre {
|
| 292 |
-
background: #
|
| 293 |
-
color: #
|
| 294 |
padding: 16px;
|
| 295 |
border-radius: 12px;
|
| 296 |
overflow-x: auto;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
}
|
| 298 |
.meta {
|
| 299 |
color: #6b7280;
|
|
@@ -319,17 +341,18 @@ def root():
|
|
| 319 |
<div class="guide">
|
| 320 |
<h2>使用指南</h2>
|
| 321 |
<p>1. 点击 <code>打开接口文档</code>,进入 Swagger 页面。</p>
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
"text": "这里放一段较长的中文文本",
|
| 326 |
"target_length": 120
|
| 327 |
}</code></pre>
|
| 328 |
<p>4. 点击 <code>Execute</code> 后查看返回的摘要结果。</p>
|
| 329 |
<p>5. 如果想确认服务是否正常,可点击 <code>检查服务状态</code>,返回 <code>ok</code> 即表示运行正常。</p>
|
| 330 |
<p>6. 如果接口返回 <code>backend=fallback</code>,请查看响应里的 <code>error</code> 字段,这表示 Transformer 生成阶段失败,系统才会自动切回备用摘要。</p>
|
|
|
|
| 331 |
<div class="meta">
|
| 332 |
-
提示:
|
| 333 |
</div>
|
| 334 |
</div>
|
| 335 |
</div>
|
|
|
|
| 7 |
|
| 8 |
from fastapi import FastAPI
|
| 9 |
from fastapi.responses import HTMLResponse
|
| 10 |
+
from fastapi import Body, Query
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
| 13 |
try:
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
class SummarizationConfig:
|
| 34 |
+
model_name: str = os.getenv("MODEL_NAME", "fnlp/bart-base-chinese")
|
| 35 |
+
max_source_length: int = 512
|
| 36 |
max_target_length: int = 160
|
| 37 |
num_beams: int = 4
|
| 38 |
no_repeat_ngram_size: int = 3
|
|
|
|
| 152 |
prompt,
|
| 153 |
return_tensors="pt",
|
| 154 |
truncation=True,
|
| 155 |
+
max_length=SummarizationConfig.max_source_length,
|
| 156 |
)
|
| 157 |
inputs.pop("token_type_ids", None)
|
| 158 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
|
|
| 210 |
)
|
| 211 |
|
| 212 |
|
| 213 |
+
@app.post("/summarize-plain", response_model=SummarizeResponse)
|
| 214 |
+
def summarize_plain(
|
| 215 |
+
text: str = Body(..., media_type="text/plain", description="直接粘贴原文,支持换行和空格"),
|
| 216 |
+
target_length: int = Query(120, ge=1, description="目标摘要长度"),
|
| 217 |
+
):
|
| 218 |
+
result = engine.summarize(text, target_length=target_length)
|
| 219 |
+
return SummarizeResponse(
|
| 220 |
+
summary=result.summary,
|
| 221 |
+
backend=result.backend,
|
| 222 |
+
target_length=result.used_target_length,
|
| 223 |
+
error=result.error,
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
@app.get("/")
|
| 228 |
def root():
|
| 229 |
error_note = f"<p>最近一次生成错误:<code>{engine.load_error}</code></p>" if engine.load_error else ""
|
|
|
|
| 304 |
border-radius: 6px;
|
| 305 |
}
|
| 306 |
pre {
|
| 307 |
+
background: #f8fafc;
|
| 308 |
+
color: #111827;
|
| 309 |
padding: 16px;
|
| 310 |
border-radius: 12px;
|
| 311 |
overflow-x: auto;
|
| 312 |
+
border: 1px solid rgba(148, 163, 184, 0.25);
|
| 313 |
+
}
|
| 314 |
+
pre code {
|
| 315 |
+
background: transparent;
|
| 316 |
+
padding: 0;
|
| 317 |
+
border-radius: 0;
|
| 318 |
+
color: inherit;
|
| 319 |
}
|
| 320 |
.meta {
|
| 321 |
color: #6b7280;
|
|
|
|
| 341 |
<div class="guide">
|
| 342 |
<h2>使用指南</h2>
|
| 343 |
<p>1. 点击 <code>打开接口文档</code>,进入 Swagger 页面。</p>
|
| 344 |
+
<p>2. 找到 <code>POST /summarize</code>,点击 <code>Try it out</code>。</p>
|
| 345 |
+
<p>3. 在请求体中填写文本和目标长度,例如:</p>
|
| 346 |
+
<pre><code>{
|
| 347 |
"text": "这里放一段较长的中文文本",
|
| 348 |
"target_length": 120
|
| 349 |
}</code></pre>
|
| 350 |
<p>4. 点击 <code>Execute</code> 后查看返回的摘要结果。</p>
|
| 351 |
<p>5. 如果想确认服务是否正常,可点击 <code>检查服务状态</code>,返回 <code>ok</code> 即表示运行正常。</p>
|
| 352 |
<p>6. 如果接口返回 <code>backend=fallback</code>,请查看响应里的 <code>error</code> 字段,这表示 Transformer 生成阶段失败,系统才会自动切回备用摘要。</p>
|
| 353 |
+
<p>7. 如果原文包含大量换行或空格,建议直接使用 <code>POST /summarize-plain</code>,把正文当作纯文本提交,更适合粘贴文章正文。</p>
|
| 354 |
<div class="meta">
|
| 355 |
+
提示:<code>/summarize</code> 走 JSON,<code>/summarize-plain</code> 走纯文本。前者适合结构化调用,后者适合直接粘贴文章。
|
| 356 |
</div>
|
| 357 |
</div>
|
| 358 |
</div>
|
data_utils.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Iterable, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class SummarizationExample:
    """One article/summary pair used for training and evaluation."""

    # Full source document text.
    article: str
    # Reference summary paired with the article.
    summary: str
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def load_jsonl(path: str | Path) -> List[SummarizationExample]:
    """Read a JSONL file and return the usable article/summary pairs.

    Each non-blank line is parsed as a JSON object.  The article text comes
    from the "article" key (falling back to "text") and the summary from
    "summary" (falling back to "label").  Records missing either field are
    silently skipped.
    """
    records: List[SummarizationExample] = []
    with Path(path).open("r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            payload = json.loads(stripped)
            body = payload.get("article") or payload.get("text") or ""
            target = payload.get("summary") or payload.get("label") or ""
            # Keep only complete pairs; partial records are useless downstream.
            if body and target:
                records.append(SummarizationExample(article=body, summary=target))
    return records
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def iter_pairs(examples: Iterable[SummarizationExample]):
    """Yield ``(article, summary)`` tuples from the given examples."""
    for example in examples:
        yield example.article, example.summary
|
| 34 |
+
|
evaluate.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import csv
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from bert_score import score as bertscore
|
| 10 |
+
from rouge_score import rouge_scorer
|
| 11 |
+
import torch
|
| 12 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 13 |
+
except Exception as exc: # pragma: no cover
|
| 14 |
+
raise SystemExit(
|
| 15 |
+
"Evaluation requires bert-score, rouge-score, torch and transformers. Install dependencies first."
|
| 16 |
+
) from exc
|
| 17 |
+
|
| 18 |
+
from data_utils import load_jsonl
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_args():
    """Build and parse the command-line arguments for an evaluation run."""
    cli = argparse.ArgumentParser(description="Evaluate summarization models")
    cli.add_argument("--test-path", required=True)
    cli.add_argument("--model-name", default="fnlp/bart-base-chinese")
    cli.add_argument("--max-source-length", type=int, default=512)
    cli.add_argument("--target-length", type=int, default=120)
    cli.add_argument("--tolerance", type=float, default=0.2)
    cli.add_argument("--output-csv", default="metrics_report.csv")
    cli.add_argument("--qafacteval-model-folder", default=None)
    return cli.parse_args()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def length_hit(text: str, target_length: int, tolerance: float) -> bool:
    """Return True when len(text) falls inside the tolerance band.

    The band is ``[int(target*(1-tol)), int(target*(1+tol))]`` — note the
    truncating ``int`` on both bounds, matching character-count semantics.
    """
    span = len(text)
    lower_bound = int(target_length * (1 - tolerance))
    upper_bound = int(target_length * (1 + tolerance))
    return not (span < lower_bound or span > upper_bound)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def try_qafacteval(model_folder: str | None, sources, preds):
    """Score predictions with QAFactEval when its models are available.

    Returns one score (or ``None``) per prediction.  When no model folder is
    configured, or the ``qafacteval`` package cannot be imported, every score
    is ``None`` so callers can treat the metric as optional.
    """
    placeholder = [None] * len(preds)
    if not model_folder:
        return placeholder
    try:
        from qafacteval import QAFactEval
    except Exception:
        # Optional dependency: degrade gracefully instead of failing the run.
        return placeholder
    metric = QAFactEval(
        lerc_quip_path=f"{model_folder}/quip-512-mocha",
        generation_model_path=f"{model_folder}/generation/model.tar.gz",
        answering_model_dir=f"{model_folder}/answering",
        lerc_model_path=f"{model_folder}/lerc/model.tar.gz",
        lerc_pretrained_model_path=f"{model_folder}/lerc/pretraining.tar.gz",
        cuda_device=0 if torch.cuda.is_available() else -1,
        use_lerc_quip=True,
        verbose=False,
        generation_batch_size=8,
        answering_batch_size=8,
        lerc_batch_size=4,
    )
    batch = metric.score_batch(list(sources), [[p] for p in preds], return_qa_pairs=True)
    return [row[0]["qa-eval"].get("lerc_quip") for row in batch]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def main():
    """Generate summaries for the test set and report/export the metrics.

    Prints ROUGE-L, BERTScore, length hit rate, average latency and (when
    available) QAFactEval, then writes the same numbers to a one-row CSV.
    """
    args = parse_args()
    dataset = load_jsonl(args.test_path)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
    sources = []
    refs = []
    preds = []
    times_ms = []
    length_flags = []

    for ex in dataset:
        encoded = tokenizer(
            ex.article,
            return_tensors="pt",
            truncation=True,
            max_length=args.max_source_length,
        )
        # Some Chinese BART tokenizers emit token_type_ids the model rejects.
        encoded.pop("token_type_ids", None)
        encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
        started = time.perf_counter()
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_new_tokens=max(48, min(192, int(args.target_length * 1.1))),
                num_beams=4,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                early_stopping=True,
            )
        elapsed_ms = (time.perf_counter() - started) * 1000
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True).strip()

        sources.append(ex.article)
        refs.append(ex.summary)
        preds.append(decoded)
        times_ms.append(elapsed_ms)
        length_flags.append(length_hit(decoded, args.target_length, args.tolerance))

    rouge_ls = [scorer.score(ref, pred)["rougeL"].fmeasure for ref, pred in zip(refs, preds)]
    P, R, F1 = bertscore(preds, refs, lang="zh", verbose=False)
    qafacteval_scores = try_qafacteval(args.qafacteval_model_folder, sources, preds)

    rouge_l = sum(rouge_ls) / max(1, len(rouge_ls))
    mean_f1 = F1.mean()
    # bert-score returns a tensor; fall back to plain float if it ever doesn't.
    bert_f1 = float(mean_f1.item()) if hasattr(mean_f1, "item") else float(mean_f1)
    length_rate = sum(1 for flag in length_flags if flag) / max(1, len(length_flags))
    avg_latency = sum(times_ms) / max(1, len(times_ms))
    valid_qfe = [s for s in qafacteval_scores if s is not None]
    qafacteval_avg = sum(valid_qfe) / len(valid_qfe) if valid_qfe else None

    print(f"ROUGE-L: {rouge_l:.4f}")
    print(f"BERTScore: {bert_f1:.4f}")
    print(f"Length Hit Rate: {length_rate:.4f}")
    print(f"Avg Latency(ms): {avg_latency:.2f}")
    if qafacteval_avg is not None:
        print(f"QAFactEval: {qafacteval_avg:.4f}")
    else:
        print("QAFactEval: N/A")

    out_path = Path(args.output_csv)
    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["model", "rouge_l", "bertscore", "qafacteval", "length_hit_rate", "avg_latency_ms"])
        writer.writerow(
            [
                args.model_name,
                f"{rouge_l:.4f}",
                f"{bert_f1:.4f}",
                f"{qafacteval_avg:.4f}" if qafacteval_avg is not None else "",
                f"{length_rate:.4f}",
                f"{avg_latency:.2f}",
            ]
        )
    print(f"saved metrics to {out_path}")


if __name__ == "__main__":
    main()
|
| 151 |
+
|
requirements.txt
CHANGED
|
@@ -5,3 +5,6 @@ transformers>=4.41.0
|
|
| 5 |
sentencepiece>=0.2.0
|
| 6 |
torch>=2.1.0
|
| 7 |
protobuf>=4.25.0
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
sentencepiece>=0.2.0
|
| 6 |
torch>=2.1.0
|
| 7 |
protobuf>=4.25.0
|
| 8 |
+
accelerate>=0.30.0
|
| 9 |
+
rouge-score>=0.1.2
|
| 10 |
+
bert-score>=0.3.13
|
train.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from dataclasses import asdict
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
try:
|
| 8 |
+
import torch
|
| 9 |
+
from torch.utils.data import Dataset
|
| 10 |
+
from transformers import (
|
| 11 |
+
AutoModelForSeq2SeqLM,
|
| 12 |
+
AutoTokenizer,
|
| 13 |
+
DataCollatorForSeq2Seq,
|
| 14 |
+
Seq2SeqTrainer,
|
| 15 |
+
Seq2SeqTrainingArguments,
|
| 16 |
+
)
|
| 17 |
+
except Exception as exc: # pragma: no cover
|
| 18 |
+
raise SystemExit(
|
| 19 |
+
"Training requires torch, transformers and accelerate. Install dependencies first."
|
| 20 |
+
) from exc
|
| 21 |
+
|
| 22 |
+
from data_utils import load_jsonl
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class JsonlSeq2SeqDataset(Dataset):
    """Torch dataset that tokenizes JSONL article/summary pairs on access.

    Tokenization happens lazily in ``__getitem__``; padding is left to the
    collator so each item carries only its own (truncated) token ids.
    """

    def __init__(self, path, tokenizer, max_source_length: int, max_target_length: int):
        self.examples = load_jsonl(path)
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        sample = self.examples[idx]
        features = self.tokenizer(
            sample.article,
            max_length=self.max_source_length,
            truncation=True,
        )
        target_encoding = self.tokenizer(
            text_target=sample.summary,
            max_length=self.max_target_length,
            truncation=True,
        )
        # The collator expects the reference ids under the "labels" key.
        features["labels"] = target_encoding["input_ids"]
        return features
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def parse_args():
    """Build and parse the command-line arguments for a fine-tuning run."""
    cli = argparse.ArgumentParser(description="Fine-tune a Chinese seq2seq summarization model")
    cli.add_argument("--train-path", required=True)
    cli.add_argument("--valid-path", default=None)
    cli.add_argument("--output-dir", required=True)
    cli.add_argument("--model-name", default="fnlp/bart-base-chinese")
    cli.add_argument("--max-source-length", type=int, default=512)
    cli.add_argument("--max-target-length", type=int, default=128)
    cli.add_argument("--num-train-epochs", type=float, default=3.0)
    cli.add_argument("--train-batch-size", type=int, default=2)
    cli.add_argument("--eval-batch-size", type=int, default=2)
    cli.add_argument("--learning-rate", type=float, default=3e-5)
    cli.add_argument("--logging-steps", type=int, default=25)
    cli.add_argument("--save-steps", type=int, default=200)
    return cli.parse_args()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def main():
    """Fine-tune the seq2seq summarization model and save model + tokenizer.

    Loads the base model named by ``--model-name``, builds train/eval datasets
    from JSONL files, trains with ``Seq2SeqTrainer`` and writes the resulting
    checkpoint and tokenizer into ``--output-dir``.
    """
    args = parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)

    train_dataset = JsonlSeq2SeqDataset(
        args.train_path,
        tokenizer,
        max_source_length=args.max_source_length,
        max_target_length=args.max_target_length,
    )
    # Evaluation is optional: no --valid-path means train without eval.
    eval_dataset = (
        JsonlSeq2SeqDataset(
            args.valid_path,
            tokenizer,
            max_source_length=args.max_source_length,
            max_target_length=args.max_target_length,
        )
        if args.valid_path
        else None
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        output_dir=str(output_dir),
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        predict_with_generate=True,
        num_train_epochs=args.num_train_epochs,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=2,
        # FIX: `evaluation_strategy` was renamed to `eval_strategy` in
        # transformers 4.41 and the old name is removed in later releases;
        # requirements.txt pins transformers>=4.41.0, so the new name is
        # always available and the old one would crash on current installs.
        eval_strategy="steps" if eval_dataset else "no",
        eval_steps=args.save_steps if eval_dataset else None,
        fp16=torch.cuda.is_available(),
        report_to=[],
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(str(output_dir))
    tokenizer.save_pretrained(str(output_dir))
    print(f"saved to {output_dir}")


if __name__ == "__main__":
    main()
|
| 128 |
+
|