ALIGNSCORE SERVICE
══════════════════════════════════════════════════════════════
CONFIG: CONSTANTS & STARTUP
──────────────────────────────────────────────────────────────

CONSTANTS:
    MODEL_PATH  ← path to AlignScore-base.ckpt
    DEVICE      ← "cpu"
    BATCH_SIZE  ← 32

LOAD ONCE AT STARTUP:
    model     ← BERTAlignModel (RoBERTa-base + 3 linear heads)
    tokenizer ← RoBERTa tokenizer
    softmax   ← Softmax over last dim
    nlp       ← spaCy English model
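
A rough Python sketch of this startup block (assuming Hugging Face transformers and spaCy; roberta-base plus a fresh 3-way linear head stands in for the real BERTAlignModel, whose fine-tuned weights would be loaded from MODEL_PATH):

    import torch
    import spacy
    from transformers import AutoModel, AutoTokenizer

    MODEL_PATH = "AlignScore-base.ckpt"   # illustrative path
    DEVICE     = "cpu"
    BATCH_SIZE = 32

    # RoBERTa backbone + 3-way head (ALIGNED / CONTRADICT / NEUTRAL).
    # The real service would load the fine-tuned checkpoint into these modules.
    backbone  = AutoModel.from_pretrained("roberta-base").to(DEVICE).eval()
    tri_layer = torch.nn.Linear(backbone.config.hidden_size, 3).to(DEVICE).eval()
    dropout   = torch.nn.Dropout(p=0.1).eval()      # identity at inference time

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    softmax   = torch.nn.Softmax(dim=-1)
    nlp       = spacy.load("en_core_web_sm")        # used only for sentence splitting
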
Helper chunk_text(sentences, n)
══════════════════════════════════════════════════════
HELPER: chunk_text(sentences, n)
    → yield joined groups of n sentences at a time

    EXAMPLE:
        sentences = ["S1.", "S2.", "S3.", "S4.", "S5.", "S6."]
        n = 2
        → chunks = ["S1. S2.", "S3. S4.", "S5. S6."]
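
In Python this helper is a three-line generator:

    def chunk_text(sentences, n):
        """Yield successive groups of n sentences, joined into single strings."""
        for i in range(0, len(sentences), n):
            yield " ".join(sentences[i:i + n])

    # list(chunk_text(["S1.", "S2.", "S3.", "S4.", "S5.", "S6."], 2))
    # → ["S1. S2.", "S3. S4.", "S5. S6."]
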
Core inference_core(premise_list, hypo_list)
══════════════════════════════════════════════════════════════
CORE: inference_core(premise_list, hypo_list)
──────────────────────────────────────────────────────────────

INPUT:
    premise_list = [chunk_0, chunk_0, chunk_0,
                    chunk_1, chunk_1, chunk_1,
                    chunk_2, chunk_2, chunk_2]   shape = [9]

    hypo_list    = [sent_0,  sent_1,  sent_2,
                    sent_0,  sent_1,  sent_2,
                    sent_0,  sent_1,  sent_2]    shape = [9]

──────────────────────────────────────────────────────────────
STEP 1: Split into mini-batches of BATCH_SIZE (e.g. 32)

    batch_0 = pairs [0..8]   ← all 9 fit in one batch here

──────────────────────────────────────────────────────────────
STEP 2: FOR each mini-batch → tokenize each pair

    pair_0: (chunk_0, sent_0) → [CLS] chunk_0 [SEP] sent_0 [SEP]
    pair_1: (chunk_0, sent_1) → [CLS] chunk_0 [SEP] sent_1 [SEP]
    pair_2: (chunk_0, sent_2) → [CLS] chunk_0 [SEP] sent_2 [SEP]
    pair_3: (chunk_1, sent_0) → [CLS] chunk_1 [SEP] sent_0 [SEP]
    pair_4: (chunk_1, sent_1) → [CLS] chunk_1 [SEP] sent_1 [SEP]
    pair_5: (chunk_1, sent_2) → [CLS] chunk_1 [SEP] sent_2 [SEP]
    pair_6: (chunk_2, sent_0) → [CLS] chunk_2 [SEP] sent_0 [SEP]
    pair_7: (chunk_2, sent_1) → [CLS] chunk_2 [SEP] sent_1 [SEP]
    pair_8: (chunk_2, sent_2) → [CLS] chunk_2 [SEP] sent_2 [SEP]

    EXAMPLE (pair_0):
        premise = "DeepInfer infers preconditions from DNNs."
        hypo    = "DeepInfer is a trustworthy AI tool."
        tokens  = [CLS] DeepInfer infers ... [SEP] DeepInfer is ... [SEP]
                  ([CLS]/[SEP] is shorthand; RoBERTa's actual special tokens
                   are <s> ... </s></s> ... </s>)
        padded/truncated to 512 tokens
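
A sketch of STEPS 1–2 with the Hugging Face tokenizer (argument choices are assumptions consistent with the 512-token limit above; the tokenizer inserts RoBERTa's special tokens itself):

    batch_premise = premise_list[0:BATCH_SIZE]     # here: all 9 premises
    batch_hypo    = hypo_list[0:BATCH_SIZE]        # here: all 9 hypotheses

    enc = tokenizer(
        batch_premise, batch_hypo,                 # encode as (premise, hypo) pairs
        truncation="only_first",                   # drop overflow from the long premise chunk
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    ).to(DEVICE)
    # enc["input_ids"].shape == (9, 512)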

──────────────────────────────────────────────────────────────
STEP 3: Feed all 9 pairs into RoBERTa

    input shape  = [9 × 512 tokens]
                         ↓
                   RoBERTa (12 layers)
                         ↓
    pooler_output = CLS vector per pair
    output shape = [9 × 768]

──────────────────────────────────────────────────────────────
STEP 4: dropout → tri_layer → logits

    tri_layer(dropout(pooler_output))
    output shape = [9 × 3]

           ALIGNED  CONTRADICT  NEUTRAL
    pair_0 [  2.10,    -0.92,   -0.80 ]   chunk_0 × sent_0
    pair_1 [  0.32,     0.80,   -0.17 ]   chunk_0 × sent_1
    pair_2 [  0.05,     0.90,   -0.13 ]   chunk_0 × sent_2
    pair_3 [  0.80,     0.70,    0.06 ]   chunk_1 × sent_0
    pair_4 [  1.30,     0.62,    0.32 ]   chunk_1 × sent_1
    pair_5 [  0.45,     0.70,   -0.11 ]   chunk_1 × sent_2
    pair_6 [  0.70,     0.80,    0.06 ]   chunk_2 × sent_0
    pair_7 [  1.00,     0.60,    0.13 ]   chunk_2 × sent_1
    pair_8 [  1.60,     0.79,    0.58 ]   chunk_2 × sent_2

──────────────────────────────────────────────────────────────
STEP 5: softmax → probabilities

    output shape = [9 × 3]

           ALIGNED  CONTRADICT  NEUTRAL
    pair_0 [ 0.906,    0.044,    0.050 ]   chunk_0 × sent_0
    pair_1 [ 0.310,    0.500,    0.190 ]   chunk_0 × sent_1
    pair_2 [ 0.240,    0.560,    0.200 ]   chunk_0 × sent_2
    pair_3 [ 0.420,    0.380,    0.200 ]   chunk_1 × sent_0
    pair_4 [ 0.531,    0.269,    0.200 ]   chunk_1 × sent_1
    pair_5 [ 0.350,    0.450,    0.200 ]   chunk_1 × sent_2
    pair_6 [ 0.380,    0.420,    0.200 ]   chunk_2 × sent_0
    pair_7 [ 0.480,    0.320,    0.200 ]   chunk_2 × sent_1
    pair_8 [ 0.553,    0.247,    0.200 ]   chunk_2 × sent_2
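
STEPS 3–5 as a sketch, continuing from enc above (module names follow the startup sketch; a real checkpoint defines its own):

    with torch.no_grad():
        pooled = backbone(**enc).pooler_output     # [9, 768] pooled CLS vectors
    logits = tri_layer(dropout(pooled))            # [9, 3] raw scores
    probs  = softmax(logits)                       # [9, 3], each row sums to 1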

──────────────────────────────────────────────────────────────
STEP 6: [:, 0] → grab ALIGNED column only

    output_score = [0.906, 0.310, 0.240,
                    0.420, 0.531, 0.350,
                    0.380, 0.480, 0.553]   shape = [9]

──────────────────────────────────────────────────────────────
STEP 7: Concatenate all batch scores → return flat tensor

    RETURN [0.906, 0.310, 0.240, 0.420, 0.531, 0.350, 0.380, 0.480, 0.553]
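
Assembled into one function (the same calls as the two sketches above, looped over mini-batches):

    def inference_core(premise_list, hypo_list):
        """Return one ALIGNED probability per (premise, hypo) pair as a flat tensor."""
        scores = []
        for i in range(0, len(premise_list), BATCH_SIZE):
            enc = tokenizer(
                premise_list[i:i + BATCH_SIZE], hypo_list[i:i + BATCH_SIZE],
                truncation="only_first", padding="max_length",
                max_length=512, return_tensors="pt",
            ).to(DEVICE)
            with torch.no_grad():
                pooled = backbone(**enc).pooler_output
            probs = softmax(tri_layer(dropout(pooled)))   # [batch, 3]
            scores.append(probs[:, 0])                    # keep ALIGNED column only
        return torch.cat(scores)                          # flat [len(premise_list)]
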
Core inference_per_example(paper_text, summary_text)
══════════════════════════════════════════════════════════════
CORE: inference_per_example(paper_text, summary_text)
──────────────────────────────────────────────────────────────

STEP 1: sentence_tokenize(paper_text) → premise_sents

    paper_text = "DeepInfer proposes a technique to infer preconditions.
                  It uses weakest precondition calculus.
                  Results show recall 0.98. Runtime is 0.22s."

    premise_sents = [
        "DeepInfer proposes a technique to infer preconditions.",
        "It uses weakest precondition calculus.",
        "Results show recall 0.98.",
        "Runtime is 0.22s."
    ]
    len(premise_sents) = 4 for this abridged snippet
    (the remaining steps assume the full paper: ~1,000 words, 39 sentences)
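
With the spaCy pipeline loaded at startup, the split is one line (assuming en_core_web_sm, whose parser exposes .sents):

    premise_sents = [s.text.strip() for s in nlp(paper_text).sents]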

──────────────────────────────────────────────────────────────
STEP 2: compute chunk size → group into ~350-word chunks

    word_count  = 1000
    n_chunk     = 1000 // 350 + 1 = 3    ← number of ~350-word chunks
    total sents = 39
    chunk_size  = 39 // 3 = 13           ← sentences per chunk

    chunk_0 = sents[0:13]   "DeepInfer proposes... [13 sents]"
    chunk_1 = sents[13:26]  "We evaluate on 29 models... [13 sents]"
    chunk_2 = sents[26:39]  "Results show recall 0.98... [13 sents]"

    premise_chunks = [chunk_0, chunk_1, chunk_2]
    len(premise_chunks) = 3
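
A sketch of this step, reusing chunk_text from above (the floor division mirrors the arithmetic shown; the exact rounding is an implementation detail):

    word_count = len(paper_text.split())
    n_chunk    = word_count // 350 + 1                    # target number of ~350-word chunks
    chunk_size = max(len(premise_sents) // n_chunk, 1)    # sentences per chunk
    premise_chunks = list(chunk_text(premise_sents, chunk_size))
    # 1000 words, 39 sentences → 3 chunks of 13 sentences each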

──────────────────────────────────────────────────────────────
STEP 3: sentence_tokenize(summary_text) → hypo_sents

    summary_text = "DeepInfer infers preconditions. Violations
                    correlate 0.88 with errors. Recall is 0.98."

    hypo_sents = [
        "DeepInfer infers preconditions.",
        "Violations correlate 0.88 with errors.",
        "Recall is 0.98."
    ]
    len(hypo_sents) = 3

──────────────────────────────────────────────────────────────
STEP 4: BUILD flat pair lists (all chunk × sentence combos)

    premise_sent_mat = [chunk_0, chunk_0, chunk_0,   ← repeat each chunk
                        chunk_1, chunk_1, chunk_1,      once per hypo sent
                        chunk_2, chunk_2, chunk_2]

    hypo_sents_mat   = [sent_0,  sent_1,  sent_2,    ← cycle through sents
                        sent_0,  sent_1,  sent_2,       for every chunk
                        sent_0,  sent_1,  sent_2]

    total pairs = 3 chunks × 3 sents = 9
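
The cross-product is a plain nested loop:

    premise_sent_mat, hypo_sents_mat = [], []
    for chunk in premise_chunks:            # repeat each chunk once per summary sentence
        for sent in hypo_sents:             # cycle through the sentences for every chunk
            premise_sent_mat.append(chunk)
            hypo_sents_mat.append(sent)
    # 3 chunks × 3 sentences → 9 pairs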

──────────────────────────────────────────────────────────────
STEP 5: output_score = inference_core(premise_sent_mat,
                                      hypo_sents_mat)

    inference_core has already taken the ALIGNED column and concatenated the
    batches (its STEPS 6–7 above), so it returns one flat score per
    chunk × sentence pair:

    output_score = [0.906, 0.310, 0.240,
                    0.420, 0.531, 0.350,
                    0.380, 0.480, 0.553]   shape = [9]

──────────────────────────────────────────────────────────────
STEP 6: .view(3, 3) → reshape into score_matrix

    output_score.view(len(premise_chunks), len(hypo_sents))

                  sent_0   sent_1   sent_2
    chunk_0     [ 0.906,   0.310,   0.240 ]
    chunk_1     [ 0.420,   0.531,   0.350 ]
    chunk_2     [ 0.380,   0.480,   0.553 ]

──────────────────────────────────────────────────────────────
STEP 7: .max(dim=0) → best chunk per sentence (collapse rows)

    sent_0: max(0.906, 0.420, 0.380) = 0.906  ← chunk_0 wins
    sent_1: max(0.310, 0.531, 0.480) = 0.531  ← chunk_1 wins
    sent_2: max(0.240, 0.350, 0.553) = 0.553  ← chunk_2 wins

    best_scores = [0.906, 0.531, 0.553]

──────────────────────────────────────────────────────────────
STEP 8: .mean().item() → single final score

    mean(0.906, 0.531, 0.553) = 0.663

    RETURN 0.663
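
STEPS 5–8 condense to a few tensor operations (torch names as in the steps above):

    output_score = inference_core(premise_sent_mat, hypo_sents_mat)          # flat [9]
    score_matrix = output_score.view(len(premise_chunks), len(hypo_sents))   # [3, 3]
    best_scores  = score_matrix.max(dim=0).values        # best chunk per sentence, [3]
    final_score  = best_scores.mean().item()             # single float, e.g. 0.663
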
Service get_align_score(paper_text, summary_text)
══════════════════════════════════════════════════════════════
SERVICE: get_align_score(paper_text, summary_text)
──────────────────────────────────────────────────────────────

    score       = inference_per_example(paper_text, summary_text)
                = 0.663

    confidence  = get_confidence(0.663)
                = "Medium"     ← 0.6 ≤ 0.663 < 0.8

    is_reliable = 0.663 ≥ 0.7
                = False

    RETURN (0.663, "Medium", False)
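
A sketch of the wrapper, assuming the bands implied above (Medium covers 0.6–0.8, so High ≥ 0.8 and Low < 0.6 are inferred; reliability cut-off 0.7 as shown):

    def get_confidence(score):
        if score >= 0.8:
            return "High"
        if score >= 0.6:
            return "Medium"
        return "Low"

    def get_align_score(paper_text, summary_text):
        score = inference_per_example(paper_text, summary_text)
        return score, get_confidence(score), score >= 0.7
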
Service get_align_score_breakdown(paper, summary, top_k=3)
══════════════════════════════════════════════════════════════
SERVICE: get_align_score_breakdown(paper, summary, top_k=3)
──────────────────────────────────────────────────────────────

    [same STEPS 1–6 above → reuse score_matrix]

                  sent_0   sent_1   sent_2
    chunk_0     [ 0.906,   0.310,   0.240 ]
    chunk_1     [ 0.420,   0.531,   0.350 ]
    chunk_2     [ 0.380,   0.480,   0.553 ]

    FOR sent_0 "DeepInfer infers preconditions.":
        col = [0.906, 0.420, 0.380]
        best_chunk = chunk_0  @ 0.906
        top_3 = [(chunk_0, 0.906), (chunk_1, 0.420), (chunk_2, 0.380)]

    FOR sent_1 "Violations correlate 0.88 with errors.":
        col = [0.310, 0.531, 0.480]
        best_chunk = chunk_1  @ 0.531
        top_3 = [(chunk_1, 0.531), (chunk_2, 0.480), (chunk_0, 0.310)]

    FOR sent_2 "Recall is 0.98.":
        col = [0.240, 0.350, 0.553]
        best_chunk = chunk_2  @ 0.553
        top_3 = [(chunk_2, 0.553), (chunk_1, 0.350), (chunk_0, 0.240)]

    RETURN {
        align_score:         0.663,
        confidence:          "Medium",
        is_reliable:         False,
        num_context_chunks:  3,
        num_claim_sentences: 3,
        per_sentence: [
            { sent_0, best=chunk_0 @ 0.906, top_3=[...] },
            { sent_1, best=chunk_1 @ 0.531, top_3=[...] },
            { sent_2, best=chunk_2 @ 0.553, top_3=[...] }
        ]
    }
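
Continuing from score_matrix, premise_chunks, hypo_sents, and final_score above, the per-sentence breakdown is a top-k over each column (field names mirror the RETURN sketch; they are illustrative, not a fixed schema):

    per_sentence = []
    for j, sent in enumerate(hypo_sents):
        col = score_matrix[:, j]                          # one score per chunk for this sentence
        top = torch.topk(col, k=min(top_k, len(premise_chunks)))
        per_sentence.append({
            "sentence":   sent,
            "best_chunk": premise_chunks[top.indices[0].item()],
            "best_score": top.values[0].item(),
            "top_k":      [(premise_chunks[i], v)
                           for i, v in zip(top.indices.tolist(), top.values.tolist())],
        })

    breakdown = {
        "align_score":         final_score,
        "confidence":          get_confidence(final_score),
        "is_reliable":         final_score >= 0.7,
        "num_context_chunks":  len(premise_chunks),
        "num_claim_sentences": len(hypo_sents),
        "per_sentence":        per_sentence,
    }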