How to Build Smarter Multilingual Text Wrapping with BudouX Through Parsing, HTML Rendering, Model Introspection, and Toy Training

Editor
3 Min Read


def adaboost(X, y, rounds=80):
   """Train a toy AdaBoost ensemble of single-feature decision stumps.

   Each stump tests whether one feature is present in a sample. With
   polarity ``+1`` it predicts ``+1`` when the feature is present and ``-1``
   otherwise; polarity ``-1`` flips both predictions.

   Args:
       X: Sequence of samples, each an iterable of hashable features. The
           features must be mutually orderable because they are sorted to
           make the stump search deterministic.
       y: Sequence of labels (``+1`` or ``-1``) aligned with ``X``.
       rounds: Maximum number of boosting rounds to run.

   Returns:
       A list of ``(feature, polarity, alpha)`` tuples, one per round kept.
       The list is empty when there is no training data or when no stump
       does better than chance.
   """
   import math  # local import: do not rely on the file's top-level imports

   n = len(y)
   if n == 0:
       # Nothing to learn from; also avoids dividing by zero below.
       return []
   w = [1 / n] * n
   feat_set = sorted({f for fx in X for f in fx})
   fmap = [set(fx) for fx in X]
   is_pos = [label == 1 for label in y]
   model_rounds = []
   for _ in range(rounds):
       best_feat, best_err, best_pol = None, 1.0, 1
       for f in feat_set:
           # Weighted error of "feature present => +1"; the flipped stump
           # errs on exactly the complementary samples (weights sum to 1).
           err_pos = sum(
               wi for wi, feats, pos in zip(w, fmap, is_pos)
               if (f in feats) != pos
           )
           err_neg = 1 - err_pos
           if err_pos < best_err:
               best_feat, best_err, best_pol = f, err_pos, +1
           if err_neg < best_err:
               best_feat, best_err, best_pol = f, err_neg, -1
       # Stop once the best stump is no better than a coin flip.
       if best_err >= 0.5 - 1e-9:
           break
       # Clamp so a perfect stump yields a large but finite weight.
       eps = max(best_err, 1e-6)
       # Standard AdaBoost stump weight: half the log-odds of being right.
       alpha = 0.5 * math.log((1 - eps) / eps)
       # Correct samples are scaled by exp(-alpha), mistakes by exp(+alpha),
       # so the next round concentrates on what this stump got wrong.
       shrink, grow = math.exp(-alpha), math.exp(alpha)
       new_w = []
       for i in range(n):
           pred = best_pol if best_feat in fmap[i] else -best_pol
           new_w.append(w[i] * (shrink if pred == y[i] else grow))
       s = sum(new_w)
       w = [x / s for x in new_w]
       model_rounds.append((best_feat, best_pol, alpha))
   return model_rounds


# Fit the toy booster on the prepared (X, y) samples and report how long
# training took and how many stumps were kept.
print("Training (this is a toy trainer — be patient ~10s)...")
t0 = time.perf_counter()
rounds = adaboost(X, y, rounds=60)
print(f"Done in {time.perf_counter()-t0:.1f}s, {len(rounds)} stumps kept.")


# Re-score every training sample with the weighted stump vote and count how
# many predictions match their label.
correct = 0
for fx, label in zip(X, y):
   # A stump votes +alpha when feature presence agrees with its polarity,
   # and -alpha otherwise.
   votes = [-a if (f in fx) != (p == 1) else a for f, p, a in rounds]
   score = sum(votes)
   if score > 0:
       pred = 1
   else:
       pred = -1
   if pred == label:
       correct += 1
print(f"Training accuracy of toy model: {correct/len(X)*100:.1f}%")
print("👉 For a production model, use `scripts/train.py` from the BudouX repo with the matching feature extractor — this section is illustrative.")


# Section banner for the final demo. `header` is defined outside this chunk.
header("8️⃣ Real-world demo — narrow column comparison")


# Sample Japanese paragraph, assembled from adjacent string literals; it is
# embedded in both columns of the comparison below.
paragraph = ("BudouXはGoogleが開発したオープンソースの改行ライブラリです。"
            "機械学習モデルを使って、文章を意味のあるフレーズに分割し、"
            "読みやすい位置でのみ改行が起こるようにします。"
            "依存関係がなく軽量なため、ウェブサイトやモバイルアプリに"
            "簡単に組み込むことができます。")
# Render two side-by-side boxes (each capped at 180px wide): the left one
# embeds the raw paragraph, the right one embeds the result of
# ja_parser.translate_html_string(paragraph).
# NOTE(review): ja_parser, display and HTML come from outside this chunk —
# presumably a BudouX Japanese parser and IPython's display helpers; confirm.
display(HTML(f"""
<div style="display:flex; gap:24px; font-family:'Hiragino Sans','Yu Gothic',sans-serif; font-size:15px;">
 <div style="flex:1; border:2px solid #c33; padding:12px; max-width:180px;">
   <b style="color:#c33;">Without BudouX</b>
   <p style="line-height:1.7;">{paragraph}</p>
 </div>
 <div style="flex:1; border:2px solid #2a8; padding:12px; max-width:180px;">
   <b style="color:#2a8;">With BudouX</b>
   <p style="line-height:1.7;">{ja_parser.translate_html_string(paragraph)}</p>
 </div>
</div>
<p style="font-size:12px;color:#666;">Resize the browser/Colab pane to see the difference more clearly — BudouX never breaks a phrase mid-word.</p>
"""))


# Closing message for the tutorial.
print("\n🌸 Tutorial complete. Try plugging BudouX output into your own UI.")
Share this Article
Please enter a CoinGecko Free API key to get this plugin to work.