Spaces:

NohTow
/

LLM_watermarking

Runtime error

File size: 12,365 Bytes

ed02397

import transformers
from transformers import AutoTokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from transformers import pipeline, set_seed, LogitsProcessor
from transformers.generation.logits_process import TopPLogitsWarper, TopKLogitsWarper
import torch
from scipy.special import gamma, gammainc, gammaincc, betainc
from scipy.optimize import fminbound
import numpy as np

import os

# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
hf_token = os.environ.get('HF_TOKEN')

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def hash_tokens(input_ids: torch.LongTensor, key: int):
    """Fold a sequence of token ids and a secret key into one integer seed.

    Starting from `key`, each token updates the running value as
    ``(value * 35317 + token) % (2**64 - 1)``. An empty sequence returns
    `key` unchanged.

    Args:
        input_ids: iterable of single-element tensors (e.g. a 1-D tensor of
            token ids); each element is read with ``.item()``.
        key: initial value of the running hash.

    Returns:
        The final running value as a Python int.
    """
    multiplier = 35317
    modulus = 2 ** 64 - 1

    state = key
    for token in input_ids:
        state = (state * multiplier + token.item()) % modulus
    return state

class WatermarkingLogitsProcessor(LogitsProcessor):
    """Common state shared by the watermarking logits processors.

    Holds one `torch.Generator` per batch element (i.e. per message), created
    on the module-level `device`.

    Args:
        n: length of the pseudo-random sequences drawn by subclasses.
        key: secret key used to seed the generators.
        messages: one message per batch element; ``len(messages)`` is the
            batch size.
        window_size: number of trailing tokens subclasses hash to reseed the
            generators at every step. When falsy, the generators are seeded
            once here with `key`.
    """

    def __init__(self, n, key, messages, window_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size = len(messages)
        self.n = n
        self.key = key
        self.window_size = window_size
        self.messages = messages

        self.generators = []
        for _ in range(self.batch_size):
            rng = torch.Generator(device=device)
            # without a context window the stream is seeded only once, from the key
            if not window_size:
                rng.manual_seed(key)
            self.generators.append(rng)

class WatermarkingAaronsonLogitsProcessor( WatermarkingLogitsProcessor):
    """Logits processor that rescales scores with keyed pseudo-random draws.

    For every batch element it draws `n` uniform variables from that element's
    generator, takes their log, circularly shifts them by the element's
    message, and returns ``log(r) / exp(scores)``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return the watermarked scores for one generation step.

        Args:
            input_ids: token ids generated so far, indexed as ``[batch, position]``.
            scores: 2-D tensor of per-token scores, one row per batch element.

        Returns:
            A tensor with the same shape as `scores`.
        """
        B, V = scores.shape

        # Allocate n columns, not V: each row receives n draws below and is
        # only cut down to V afterwards. Allocating V columns made the row
        # assignment fail whenever n != V.
        r = scores.new_zeros((B, self.n))
        for b in range(B):
            rng = self.generators[b]
            if self.window_size:
                # reseed from the hash of the last `window_size` tokens
                window = input_ids[b, -self.window_size:]
                seed = hash_tokens(window, self.key)
                rng.manual_seed(seed)
            # log of uniform draws, circularly shifted by the message
            r[b] = torch.rand(self.n, generator=rng, device=rng.device).log().roll(-self.messages[b])
        # generate n but keep only V, as we want to keep the pseudo-random sequences in sync with the decoder
        r = r[:, :V]

        # modify law as r^(1/p)
        # Since a logits processor takes and outputs logits, we return
        # log(r^(1/p)) = log(r) / p; `r` already holds log-uniform values.
        # NOTE: scores.exp() equals p only if `scores` are log-probabilities;
        # for raw logits it differs from p by a positive per-row factor, which
        # leaves the per-row argmax unchanged.
        return r / scores.exp()

class WatermarkingKirchenbauerLogitsProcessor(WatermarkingLogitsProcessor):
    """Logits processor that adds a constant bias to a keyed "green list".

    Args:
        gamma: fraction of the `n` positions placed on the green list.
        delta: bias added to the scores of green-list tokens.
        *args, **kwargs: forwarded to `WatermarkingLogitsProcessor`.
    """

    def __init__(self, *args,
                 gamma = 0.5,
                 delta = 4.0,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.delta = delta

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add `delta` to the green-list entries of `scores` (in place) and return it."""
        batch, vocab = scores.shape
        green_count = int(self.gamma * self.n)  # gamma * n

        for row in range(batch):
            rng = self.generators[row]
            if self.window_size:
                # reseed from the hash of the last `window_size` tokens
                context = input_ids[row, -self.window_size:]
                rng.manual_seed(hash_tokens(context, self.key))

            permutation = torch.randperm(self.n, generator=rng, device=rng.device)
            boost = torch.zeros(self.n).to(scores.device)
            boost[permutation[:green_count]] = self.delta
            # shift by the message, then keep only the first `vocab` entries
            scores[row] += boost.roll(-self.messages[row])[:vocab]

        return scores

class Watermarker(object):
    """Generate watermarked text with a causal language model and detect it.

    Args:
        modelname: model identifier passed to `AutoTokenizer.from_pretrained`
            and `AutoModelForCausalLM.from_pretrained`.
        window_size: number of preceding tokens hashed to seed the generator
            at each position; 0 means a single stream seeded by the key.
        payload_bits: number of message bits; messages lie in
            ``[0, 2**payload_bits)``.
        logits_processor: optional list of logits processors applied before
            the watermarking processor.
    """

    def __init__(self, modelname="facebook/opt-350m", window_size = 0, payload_bits = 0, logits_processor = None, *args, **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(modelname, use_auth_token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(modelname, use_auth_token=hf_token).to(device)
        self.model.eval()
        self.window_size = window_size

        # preprocessing wrappers
        self.logits_processor = logits_processor or []

        self.payload_bits = payload_bits
        # length of the pseudo-random sequences: large enough for the
        # vocabulary and for every possible message
        self.V = max(2**payload_bits, self.model.config.vocab_size)
        self.generator = torch.Generator(device=device)

    def embed(self, key=42, messages=[1234], prompt="", max_length=30, method='aaronson'):
        """Generate one continuation of `prompt` per message.

        Args:
            key: secret key handed to the watermarking logits processor.
            messages: one message per generated text (the default list is
                only read, never mutated).
            prompt: text every generation starts from.
            max_length: `max_length` passed to `model.generate`.
            method: 'aaronson' or 'kirchenbauer' for watermarked generation,
                'greedy' or 'sampling' for generation without a watermark.

        Returns:
            The result of `tokenizer.batch_decode` on the generated ids.

        Raises:
            AssertionError: if `payload_bits` is set and a message is outside
                ``[0, 2**payload_bits)``.
            Exception: if `method` is not one of the names above.
        """
        B = len(messages) # batch size

        # compute capacity
        if self.payload_bits:
            assert all(0 <= message < 2**self.payload_bits for message in messages)

        # tokenize prompt
        inputs = self.tokenizer([ prompt ] * B, return_tensors="pt")

        watermark_args = dict(n=self.V, key=key, messages=messages, window_size=self.window_size)
        if method == 'aaronson':
            # greedy search on the watermarked scores
            do_sample = False
            processors = self.logits_processor + [WatermarkingAaronsonLogitsProcessor(**watermark_args)]
        elif method == 'kirchenbauer':
            # sampling from the biased scores
            do_sample = True
            processors = self.logits_processor + [WatermarkingKirchenbauerLogitsProcessor(**watermark_args)]
        elif method == 'greedy':
            # greedy search, no watermark
            do_sample = False
            processors = self.logits_processor
        elif method == 'sampling':
            # sampling, no watermark
            do_sample = True
            processors = self.logits_processor
        else:
            raise Exception('Unknown method %s' % method)

        generated_ids = self.model.generate(inputs.input_ids.to(device), max_length=max_length, do_sample=do_sample,
                                            logits_processor=processors)
        decoded_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        return decoded_texts

    @staticmethod
    def _draw_position_scores(method, generator, V, gamma):
        """Draw from `generator` the length-V score vector used at one position."""
        if method == 'kirchenbauer':
            # indicator vector of the green list
            r = torch.zeros(V, device=device)
            vocab_permutation = torch.randperm(V, generator = generator, device=generator.device)
            greenlist = vocab_permutation[:int(gamma * V)]
            r[greenlist] = 1
            return r

        R = torch.rand(V, generator = generator, device = generator.device)
        if method == 'aaronson':
            return -(1-R).log()
        # 'aaronson_simplified' and 'aaronson_neyman_pearson'
        return -R.log()

    @staticmethod
    def _single_message_pvalue(method, S, k, gamma, p_scored=None):
        """Return the p-value of statistic `S` over `k` scored tokens for one message.

        `p_scored` holds the model probabilities of the scored tokens and is
        only used by 'aaronson_neyman_pearson'.
        """
        if method == 'aaronson':
            return gammaincc(k, S)
        if method == 'aaronson_simplified':
            return gammainc(k, S)
        if method == 'kirchenbauer':
            return betainc(S, k - S + 1, gamma)

        # 'aaronson_neyman_pearson': Chernoff bound
        ratio = p_scored / (1 - p_scored)
        E = (1/ratio).sum()
        if S > E:
            return 1.0

        # to compute p-value we must solve for c*:
        # (1/(c* + ps/(1-ps))).sum() = S
        func = lambda c : (((1 / (c + ratio)).sum() - S)**2).item()
        c1 = (k / S - torch.min(ratio)).item()
        print("max = ", c1)
        c = fminbound(func, 0, c1)
        print("solved c = ", c)
        print("solved s = ", ((1/(c + ratio)).sum()).item())
        # upper bound
        return torch.exp(torch.sum(-torch.log(1 + c / ratio)) + c * S)

    def detect(self, attacked_texts, key=42, method='aaronson', gamma=0.5, prompts=None):
        """Decode the most likely message of each text and compute its p-value.

        Args:
            attacked_texts: list of texts to analyse.
            key: secret key that seeds the generator.
            method: 'aaronson', 'aaronson_simplified',
                'aaronson_neyman_pearson' or 'kirchenbauer'.
            gamma: green-list fraction, used by 'kirchenbauer' only.
            prompts: one prompt per text; the first ``len(prompt tokens)``
                positions of each text are excluded from scoring. Defaults to
                empty prompts.

        Returns:
            ``(cdfs, ms)``: for each text, the p-value as a float and the
            decoded message as an int.

        Raises:
            Exception: if `method` is not one of the names above.
        """
        # fail early: an unknown method would otherwise only surface inside
        # the scoring loop, or as a NameError when no token gets scored
        if method not in {'aaronson', 'aaronson_simplified', 'aaronson_neyman_pearson', 'kirchenbauer'}:
            raise Exception('Unknown method %s' % method)

        if prompts is None:
            prompts = [""] * len(attacked_texts)

        generator = self.generator

        cdfs = []
        ms = []

        MAX = 2**self.payload_bits
        V = self.V

        # tokenize input
        inputs = self.tokenizer(attacked_texts, return_tensors="pt", padding=True, return_attention_mask=True)

        input_ids = inputs["input_ids"].to(self.model.device)
        attention_masks = inputs["attention_mask"].to(self.model.device)

        B,T = input_ids.shape

        ps = None
        if method == 'aaronson_neyman_pearson':
            # compute logits; no gradients are needed for detection
            with torch.no_grad():
                outputs = self.model.forward(input_ids, return_dict=True)
            logits = outputs['logits']
            # TODO
            # reapply logits processors to get same distribution

            probs = logits.softmax(dim=-1)
            # ps[b, i] is the probability assigned at position i to the token at position i+1
            ps = torch.gather(probs, 2, input_ids[:,1:,None]).squeeze_(-1)

        Z = torch.zeros(size=(B, V), dtype=torch.float32, device=device)

        # keep a history of contexts we have already seen,
        # to exclude them from score aggregation and allow
        # correct p-value computation under H0
        history = [set() for _ in range(B)]

        attention_masks_prompts = self.tokenizer(prompts, return_tensors="pt", padding=True, return_attention_mask=True)["attention_mask"]
        prompts_length = torch.sum(attention_masks_prompts, dim=1)
        for b in range(B):
            # prompt tokens are never scored
            attention_masks[b, :prompts_length[b]] = 0
            if not self.window_size:
                generator.manual_seed(key)
            # position i+1 is scored from the context ending at position i
            for i in range(T-1):

                if self.window_size:
                    window = input_ids[b, max(0, i-self.window_size+1):i+1]
                    seed = hash_tokens(window, key)
                    if seed not in history[b]:
                        generator.manual_seed(seed)
                        history[b].add(seed)
                    else:
                        # ignore the token
                        attention_masks[b, i+1] = 0

                if not attention_masks[b,i+1]:
                    continue

                token = int(input_ids[b,i+1])
                r = self._draw_position_scores(method, generator, V, gamma)

                if method == 'aaronson_neyman_pearson':
                    # Neyman-Pearson
                    Z[b] += r.roll(-token) * (1/ps[b,i] - 1)
                else:
                    # independent of probs
                    Z[b] += r.roll(-token)

        for b in range(B):
            if method in {'aaronson', 'kirchenbauer'}:
                m = int(torch.argmax(Z[b,:MAX]))
            else:
                m = int(torch.argmin(Z[b,:MAX]))
            S = Z[b, m].item()

            # Number of tokens that contributed to Z[b]: only positions
            # 1..T-1 are ever scored, so count the mask over those positions.
            # (Subtracting one from the full mask sum undercounts whenever
            # position 0 is already masked out by the prompt.)
            scored = attention_masks[b, 1:].bool()
            k = int(scored.sum().item())

            # probabilities of exactly the scored tokens
            p_scored = ps[b][scored] if ps is not None else None
            cdf = self._single_message_pvalue(method, S, k, gamma, p_scored)

            # account for having tested MAX candidate messages
            if cdf > min(1 / MAX, 1e-5):
                cdf = 1 - (1 - cdf)**MAX # true value
            else:
                cdf = cdf * MAX # numerically stable upper bound
            cdfs.append(float(cdf))
            ms.append(m)

        return cdfs, ms