Merge remote-tracking branch 'temp/concedo' into concedo_experimental
# Conflicts:
#	koboldcpp.py
LostRuins committed May 2, 2023
2 parents 58f25dc + 069b3d4 commit 966cd2c
Showing 4 changed files with 63 additions and 49 deletions.
.gitignore (10 additions, 0 deletions)
@@ -47,3 +47,13 @@ zig-cache/
ppl-*.txt

examples/jeopardy/results.txt
koboldcpp.so
koboldcpp_noavx2.so
koboldcpp_openblas.so
koboldcpp_openblas_noavx2.so
koboldcpp_clblast.so
koboldcpp.dll
koboldcpp_noavx2.dll
koboldcpp_openblas.dll
koboldcpp_openblas_noavx2.dll
koboldcpp_clblast.dll
expose.h (1 addition, 0 deletions)
@@ -11,6 +11,7 @@ struct load_model_inputs
const char * model_filename;
const char * lora_filename;
const bool use_mmap;
const bool use_mlock;
const bool use_smartcontext;
const bool unban_tokens;
const int clblast_info = 0;
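
The new `use_mlock` field is appended to `load_model_inputs`, the struct that koboldcpp.py fills in and passes across the C boundary via ctypes. For that call to work, the Python-side mirror of this struct has to declare the same fields, in the same order and with matching types, which is exactly what the koboldcpp.py hunk later in this diff does. A trimmed sketch of that Python counterpart (only the fields visible in this diff; the real struct carries more members):

```python
import ctypes

# Trimmed sketch of the Python mirror of expose.h's load_model_inputs.
# Field order and types must match the C struct exactly; a mismatch shifts
# the offsets of every member that follows it.
class load_model_inputs(ctypes.Structure):
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("lora_filename", ctypes.c_char_p),
                ("use_mmap", ctypes.c_bool),
                ("use_mlock", ctypes.c_bool),   # new field, same position as in expose.h
                ("use_smartcontext", ctypes.c_bool),
                ("unban_tokens", ctypes.c_bool),
                ("clblast_info", ctypes.c_int)]
```
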
gpttype_adapter.cpp (1 addition, 1 deletion)
@@ -158,7 +158,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
llama_ctx_params.f16_kv = inputs.f16_kv;
llama_ctx_params.logits_all = false;
llama_ctx_params.use_mmap = inputs.use_mmap;
- llama_ctx_params.use_mlock = false;
+ llama_ctx_params.use_mlock = inputs.use_mlock;

llama_ctx_v1 = llama_init_from_file(modelname.c_str(), llama_ctx_params);

koboldcpp.py (51 additions, 48 deletions)
@@ -1,4 +1,4 @@
# A hacky little script from Concedo that exposes llama.cpp function bindings
# allowing it to be used via a simulated kobold api endpoint
# generation delay scales linearly with original prompt length.

@@ -18,6 +18,7 @@ class load_model_inputs(ctypes.Structure):
("model_filename", ctypes.c_char_p),
("lora_filename", ctypes.c_char_p),
("use_mmap", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("use_smartcontext", ctypes.c_bool),
("unban_tokens", ctypes.c_bool),
("clblast_info", ctypes.c_int),
@@ -50,7 +51,7 @@ def file_exists(filename):
return os.path.exists(os.path.join(getdirpath(), filename))

def pick_existant_file(ntoption,nonntoption):
ntexist = file_exists(ntoption)
nonntexist = file_exists(nonntoption)
if os.name == 'nt':
if nonntexist and not ntexist:
@@ -69,19 +70,19 @@ def pick_existant_file(ntoption,nonntoption):


def init_library():
global handle
global lib_default,lib_noavx2,lib_openblas,lib_openblas_noavx2,lib_clblast

libname = ""
use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
use_clblast = False #uses CLBlast instead
use_noavx2 = False #uses openblas with no avx2 instructions

if args.noavx2:
use_noavx2 = True
if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")):
print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
elif args.noblas:
print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
elif args.noblas:
print("Attempting to use non-avx2 compatibility library without OpenBLAS.")
else:
use_blas = True
@@ -100,7 +101,7 @@ def init_library():
else:
use_blas = True
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")

if use_noavx2:
if use_blas:
libname = lib_openblas_noavx2
@@ -120,11 +121,11 @@ def init_library():
#OpenBLAS should provide about a 2x speedup on prompt ingestion if compatible.
handle = ctypes.CDLL(os.path.join(dir_path, libname))

handle.load_model.argtypes = [load_model_inputs]
handle.load_model.restype = ctypes.c_bool
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
handle.generate.restype = generation_outputs

def load_model(model_filename):
inputs = load_model_inputs()
inputs.model_filename = model_filename.encode("UTF-8")
@@ -134,6 +135,7 @@ def load_model(model_filename):
inputs.threads = args.threads
inputs.f16_kv = True
inputs.use_mmap = (not args.nommap)
inputs.use_mlock = args.usemlock
if args.lora and args.lora!="":
inputs.use_mmap = False
inputs.use_smartcontext = args.smartcontext
@@ -212,31 +214,31 @@ def do_GET(self):
self.end_headers()
print("Force redirect to streaming mode, as --stream is set.")
return None

if self.embedded_kailite is None:
response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
else:
response_body = self.embedded_kailite

elif self.path.endswith(('/api/v1/model', '/api/latest/model')):
response_body = (json.dumps({'result': friendlymodelname }).encode())

elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
response_body = (json.dumps({"value": maxlen}).encode())

elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
response_body = (json.dumps({"value": maxctx}).encode())

elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
response_body = (json.dumps({"value":""}).encode())

elif self.path.endswith(('/api/v1/config/soft_prompts_list', '/api/latest/config/soft_prompts_list')):
response_body = (json.dumps({"values": []}).encode())

elif self.path.endswith(('/api/v1/info/version', '/api/latest/info/version')):
response_body = (json.dumps({"result":"1.2.2"}).encode())

elif self.path.endswith(('/api/extra/version')):
response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode())

if response_body is None:
@@ -250,11 +252,11 @@ def do_GET(self):
self.end_headers()
self.wfile.write(response_body)
return

def do_POST(self):
global modelbusy
content_length = int(self.headers['Content-Length'])
body = self.rfile.read(content_length)
basic_api_flag = False
kai_api_flag = False
self.path = self.path.rstrip('/')
@@ -281,16 +283,16 @@ def do_POST(self):
except ValueError as e:
self.send_response(503)
self.end_headers()
return
print("\nInput: " + json.dumps(genparams))

modelbusy = True
if kai_api_flag:
fullprompt = genparams.get('prompt', "")
else:
fullprompt = genparams.get('text', "")
newprompt = fullprompt

recvtxt = ""
res = {}
if kai_api_flag:
@@ -309,7 +311,7 @@ def do_POST(self):
stop_sequence=genparams.get('stop_sequence', [])
)
print("\nOutput: " + recvtxt)
res = {"results": [{"text": recvtxt}]}
res = {"results": [{"text": recvtxt}]}
else:
recvtxt = generate(
prompt=newprompt,
@@ -327,21 +329,21 @@ def do_POST(self):
print("\nOutput: " + recvtxt)
res = {"data": {"seqs":[recvtxt]}}

try:
self.send_response(200)
self.end_headers()
self.wfile.write(json.dumps(res).encode())
except:
print("Generate: The response could not be sent, maybe connection was terminated?")
modelbusy = False
return
self.send_response(404)
self.end_headers()

def do_OPTIONS(self):
self.send_response(200)
self.end_headers()

def do_HEAD(self):
self.send_response(200)
self.end_headers()
@@ -354,7 +356,7 @@ def end_headers(self):
self.send_header('Content-type', 'application/json')
else:
self.send_header('Content-type', 'text/html')

return super(ServerRequestHandler, self).end_headers()


@@ -418,15 +420,15 @@ def guilaunch():
# Adjust size
root.geometry("460x320")
root.title("KoboldCpp v"+KcppVersion)
tk.Label(root, text = "KoboldCpp Easy Launcher",
font = ("Arial", 12)).pack(pady=4)
tk.Label(root, text = "KoboldCpp Easy Launcher",
font = ("Arial", 12)).pack(pady=4)
tk.Label(root, text = "(Note: KoboldCpp only works with GGML model formats!)",
font = ("Arial", 9)).pack()


opts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use No BLAS","Use OpenBLAS (Old Devices)","Use No BLAS (Old Devices)"]
runchoice = tk.StringVar()
runchoice.set("Use OpenBLAS")
runchoice.set("Use OpenBLAS")
tk.OptionMenu( root , runchoice , *opts ).pack()

stream = tk.IntVar()
@@ -437,26 +439,26 @@ def guilaunch():
tk.Checkbutton(root, text='Use SmartContext',variable=smartcontext, onvalue=1, offvalue=0).pack()
tk.Checkbutton(root, text='Unban Tokens',variable=unbantokens, onvalue=1, offvalue=0).pack()
tk.Checkbutton(root, text='Launch Browser',variable=launchbrowser, onvalue=1, offvalue=0).pack()

# Create button, it will change label text
tk.Button( root , text = "Launch", font = ("Impact", 18), bg='#54FA9B', command = guilaunch ).pack(pady=10)
tk.Label(root, text = "(Please use the Command Line for more advanced options)",
font = ("Arial", 9)).pack()
font = ("Arial", 9)).pack()

root.mainloop()

if launchclicked==False:
print("Exiting by user request.")
time.sleep(2)
sys.exit()

#load all the vars
args.stream = (stream.get()==1)
args.smartcontext = (smartcontext.get()==1)
args.launch = (launchbrowser.get()==1)
args.unbantokens = (unbantokens.get()==1)
selchoice = runchoice.get()

if selchoice==opts[1]:
args.useclblast = [0,0]
if selchoice==opts[2]:
@@ -470,11 +472,11 @@ def guilaunch():
if selchoice==opts[6]:
args.nonoavx2 = True
args.noblas = True

root = tk.Tk()
root.attributes("-alpha", 0)
args.model_param = askopenfilename(title="Select ggml model .bin files")
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
time.sleep(2)
@@ -491,11 +493,11 @@ def guilaunch():
sys.exit(2)

def main(args):
embedded_kailite = None
if not args.model_param:
args.model_param = args.model
if not args.model_param:
#give them a chance to pick a file
print("For command line arguments, please refer to --help")
print("Otherwise, please manually select ggml file:")
@@ -525,7 +527,7 @@ def main(args):
except Exception as ex:
print("Error, Could not change process priority: " + str(ex))

init_library() # Note: if blas does not exist and is enabled, program will crash.
print("==========")
time.sleep(1)
if not os.path.exists(args.model_param):
@@ -568,10 +570,10 @@ def main(args):
print(f"Starting Kobold HTTP Server on port {args.port}")
epurl = ""
if args.host=="":
epurl = f"http://localhost:{args.port}"
epurl = f"http://localhost:{args.port}"
else:
epurl = f"http://{args.host}:{args.port}"
epurl = f"http://{args.host}:{args.port}"

if args.launch:
try:
import webbrowser as wb
@@ -596,7 +598,7 @@ def main(args):

#os.environ["OMP_NUM_THREADS"] = '12'
# psutil.cpu_count(logical=False)
physical_core_limit = 1
if os.cpu_count()!=None and os.cpu_count()>1:
physical_core_limit = int(os.cpu_count()/2)
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
@@ -608,6 +610,7 @@ def main(args):
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='store_true')
parser.add_argument("--skiplauncher", help="Doesn't display or use the new GUI launcher.", action='store_true')
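
Taken together, the koboldcpp.py changes wire a new `--usemlock` switch through to the loader: argparse sets `args.usemlock`, `load_model()` copies it into `inputs.use_mlock`, and the C++ adapter above forwards it to `llama_ctx_params.use_mlock`, which asks llama.cpp to keep the model weights resident in RAM rather than letting them be swapped out. A condensed, runnable sketch of that path (the `LoadModelInputsSketch` struct below is a hypothetical two-field stand-in for the real `load_model_inputs`):

```python
import argparse
import ctypes

class LoadModelInputsSketch(ctypes.Structure):
    # Hypothetical stand-in for load_model_inputs, trimmed to the two fields
    # relevant here purely for illustration.
    _fields_ = [("use_mmap", ctypes.c_bool),
                ("use_mlock", ctypes.c_bool)]

parser = argparse.ArgumentParser()
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
parser.add_argument("--usemlock", help="Force system to keep model in RAM", action='store_true')
args = parser.parse_args(["--usemlock"])   # e.g. python koboldcpp.py model.bin --usemlock

inputs = LoadModelInputsSketch()
inputs.use_mmap = (not args.nommap)        # unchanged behaviour
inputs.use_mlock = args.usemlock           # new: ends up in llama_ctx_params.use_mlock
print(inputs.use_mmap, inputs.use_mlock)   # -> True True
```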
