i tried to create for human bot within a single column message text but still getting error:
%%writefile sft_dataloader.py
def format_prompt(prompt: str) -> str:
text = f"""
### Human: {prompt}
### Assistant:
"""
return text.strip()
class SFTDataLoader(object):
def __init__(self, data, CUTOFF_LEN, VAL_SET_SIZE, tokenizer) -> None:
super(SFTDataLoader, self).__init__()
self.data = data
self.CUTOFF_LEN = CUTOFF_LEN
self.VAL_SET_SIZE = VAL_SET_SIZE
self.tokenizer = tokenizer
def generate_prompt(self, data_point):
return format_prompt(data_point["message_tree_text"])
def tokenize(self, prompt):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
return self.tokenizer(
prompt,
truncation=True,
max_length=self.CUTOFF_LEN + 1,
padding="max_length",
return_unused_tokens=True,
)
def generate_and_tokenize_prompt(self, data_point):
# This function masks out the labels for the input,
# so that our loss is computed only on the response.
user_prompt = format_prompt(data_point["message_tree_text"])
len_user_prompt_tokens = len(
self.tokenizer(
user_prompt,
truncation=True,
max_length=self.CUTOFF_LEN + 1,
padding="max_length",
return_unused_tokens=True,
)["input_ids"]
)
full_tokens = self.tokenizer(
user_prompt,
truncation=True,
max_length=self.CUTOFF_LEN + 1,
padding="max_length",
return_unused_tokens=True,
)["input_ids"]
return {
"input_ids": full_tokens[: len_user_prompt_tokens],
"labels": full_tokens[len_user_prompt_tokens:],
"attention_mask": [1] * len(full_tokens),
}
def load_data(self):
train_val = formatted_dataset.train_test_split(
test_size=self.VAL_SET_SIZE, shuffle=True, seed=42
)
train_data = train_val["train"].shuffle().map(
self.generate_and_tokenize_prompt
)
val_data = train_val["test"].shuffle().map(
self.generate_and_tokenize_prompt
)
return train_data, val_data