from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading

model_name = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
).to(device)

system_prompt = (
    "You are ProTalk, a professional and intelligent AI. "
    "You answer clearly, politely, and with insight. "
    "Be professional, witty, and helpful in all responses."
)


def chat_loop():
    history = []
    print("ProTalk Online — type 'exit' to quit.\n")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break

        # Rebuild the full conversation prompt from the rolling history.
        prompt = (
            system_prompt
            + "\n"
            + "\n".join(history)
            + f"\nUser: {user_input}\nProTalk:"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Stream decoded tokens as they arrive; skip_prompt hides the echoed input.
        streamer = TextIteratorStreamer(
            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )

        # Run generation in a background thread so the main thread can
        # consume the streamer while tokens are still being produced.
        thread = threading.Thread(
            target=model.generate,
            kwargs={
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"],  # avoids the missing-mask warning
                "max_new_tokens": 200,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "pad_token_id": tokenizer.eos_token_id,  # phi-2 defines no pad token
                "streamer": streamer,
            },
        )
        thread.start()

        output_text = ""
        for token in streamer:
            print(token, end="", flush=True)
            output_text += token
        thread.join()
        print()

        # phi-2 has no chat-specific stop token, so it may run on and start
        # writing the next "User:" turn itself; trim that before storing
        # the reply, or the history degrades with each exchange.
        output_text = output_text.split("\nUser:")[0].strip()

        history.append(f"User: {user_input}")
        history.append(f"ProTalk: {output_text}")


if __name__ == "__main__":
    chat_loop()
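# Usage sketch (assumptions: the script is saved as protalk_chat.py -- a
# hypothetical filename -- and the dependencies are installed):
#
#   pip install torch transformers
#   python protalk_chat.py
#
# On first run, the model weights (~5 GB for phi-2) are downloaded from the
# Hugging Face Hub; type 'exit' at the User prompt to quit.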