66
77class Tokenizer_Http :
88
def __init__(self, model_id="internvl2_tokenizer"):
    """Load the tokenizer used by this class.

    Args:
        model_id: Name or path passed to ``AutoTokenizer.from_pretrained``.
            Defaults to ``"internvl2_tokenizer"``, the path that used to be
            hard-coded here, so constructing the class without arguments
            keeps working.
    """
    # NOTE(review): trust_remote_code=True allows the tokenizer repository
    # to execute its own Python code while loading; only point model_id at
    # a source you trust.
    self.tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=True, use_fast=False
    )
1513
def encode(
    self,
    prompt,
    content="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。",
):
    """Tokenize a text-only chat request.

    Args:
        prompt: Text placed in the ``user`` turn.
        content: Text placed in the ``system`` turn. Defaults to the system
            message that used to be hard-coded in this method, so a call
            with only the user text keeps working.

    Returns:
        The result of ``self.tokenizer.encode`` on the assembled chat text.
    """
    # The template pieces are joined with no padding spaces: a space after
    # "\n" or around the inserted text would add tokens and change the ids
    # the tokenizer produces for the turn markers' neighbours.
    chat_text = (
        f"<|im_start|>system\n{content}<|im_end|>"
        f"<|im_start|>user\n{prompt}<|im_end|>"
        "<|im_start|>assistant\n"
    )
    return self.tokenizer.encode(chat_text)
2018
def encode_vpm(
    self,
    prompt="Please describe the image shortly.",
    content="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。",
):
    """Tokenize a chat request whose user turn starts with an image block.

    Args:
        prompt: Text placed in the ``user`` turn after the image block.
            Defaults to a short "describe the image" question.
        content: Text placed in the ``system`` turn. Defaults to the system
            message that used to be hard-coded in this method. Callers that
            want a different system message must pass it explicitly.

    Returns:
        The result of ``self.tokenizer.encode`` on the assembled chat text.
    """
    # The image block is <img>, 64 <IMG_CONTEXT> placeholders, then </img>.
    image_block = "<img>" + "<IMG_CONTEXT>" * 64 + "</img>"
    # No padding spaces between template pieces: extra whitespace would be
    # tokenized and shift the ids around the turn markers.
    chat_text = (
        f"<|im_start|>system\n{content}<|im_end|>"
        f"<|im_start|>user\n{image_block}\n{prompt}<|im_end|>"
        "<|im_start|>assistant\n"
    )
    return self.tokenizer.encode(chat_text)
2523
@@ -42,26 +40,6 @@ def bos_token(self):
4240 def eos_token (self ):
4341 return self .tokenizer .eos_token
4442
45-
46- tokenizer = Tokenizer_Http ()
47-
48- print (tokenizer .bos_id , tokenizer .bos_token , tokenizer .eos_id , tokenizer .eos_token )
49- token_ids = tokenizer .encode_vpm ()
50- # [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198,
51- # 151646,
52- # 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648,
53- # 151647,
54- # 198, 5501, 7512, 279, 2168, 19620, 13, 151645, 151644, 77091, 198]
55- # 118
56- print (token_ids )
57- print (len (token_ids ))
58- token_ids = tokenizer .encode ("hello world" )
59- # [151644, 8948, 198, 56568, 104625, 100633, 104455, 104800, 101101, 32022, 102022, 99602, 100013, 9370, 90286, 21287, 42140, 53772, 35243, 26288, 104949, 3837, 105205, 109641, 67916, 30698, 11, 54851, 46944, 115404, 42192, 99441, 100623, 48692, 100168, 110498, 1773, 151645, 151644, 872, 198, 14990, 1879, 151645, 151644, 77091, 198]
60- # 47
61- print (token_ids )
62- print (len (token_ids ))
63-
64-
6543class Request (BaseHTTPRequestHandler ):
6644 # 通过类继承,新定义类
6745 timeout = 5
@@ -117,7 +95,7 @@ def do_POST(self):
11795 if b_img_prompt :
11896 token_ids = tokenizer .encode_vpm (prompt )
11997 else :
120- token_ids = tokenizer .encode (prompt )
98+ token_ids = tokenizer .encode (prompt , args . content )
12199 if token_ids is None :
122100 msg = json .dumps ({"token_ids" : - 1 })
123101 else :
@@ -144,8 +122,16 @@ def do_POST(self):
144122 args = argparse .ArgumentParser ()
145123 args .add_argument ("--host" , type = str , default = "localhost" )
146124 args .add_argument ("--port" , type = int , default = 8080 )
125+ args .add_argument ('--model_id' , type = str , default = 'internvl2_tokenizer' )
126+ args .add_argument ('--content' , type = str , default = '你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。' )
147127 args = args .parse_args ()
148128
129+ tokenizer = Tokenizer_Http (args .model_id )
130+
131+
132+ # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
133+ # print(tokenizer.encode("hello world", args.content))
134+
149135 host = (args .host , args .port ) # 设定地址与端口号,'localhost'等价于'127.0.0.1'
150136 print ("http://%s:%s" % host )
151137 server = HTTPServer (host , Request ) # 根据地址端口号和新定义的类,创建服务器实例
0 commit comments