Instructions to use apple/OpenELM-270M-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use apple/OpenELM-270M-Instruct with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="apple/OpenELM-270M-Instruct", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("apple/OpenELM-270M-Instruct", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use apple/OpenELM-270M-Instruct with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "apple/OpenELM-270M-Instruct"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/OpenELM-270M-Instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/apple/OpenELM-270M-Instruct

SGLang

How to use apple/OpenELM-270M-Instruct with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "apple/OpenELM-270M-Instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/OpenELM-270M-Instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "apple/OpenELM-270M-Instruct" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/OpenELM-270M-Instruct",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use apple/OpenELM-270M-Instruct with Docker Model Runner:
```
docker model run hf.co/apple/OpenELM-270M-Instruct
```

OpenELM-270M-Instruct / generate_openelm.py

qicao-apple

add OpenELM-270M-Instruct

c401df2 about 2 years ago

raw

history blame contribute delete

7.45 kB

	#
	# For licensing see accompanying LICENSE file.
	# Copyright (C) 2024 Apple Inc. All Rights Reserved.
	#

	"""Module to generate OpenELM output given a model and an input prompt."""
	import argparse
	import logging
	import os
	import time
	from typing import Optional, Tuple, Union

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM


	def generate(
	    prompt: str,
	    model: Union[str, AutoModelForCausalLM],
	    hf_access_token: Optional[str] = None,
	    tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
	    device: Optional[str] = None,
	    max_length: int = 1024,
	    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
	    generate_kwargs: Optional[dict] = None,
	) -> Tuple[str, float]:
	    """Generate text for a prompt and measure how long generation took.

	    Args:
	        prompt: The string prompt.
	        model: The LLM model. If a string is passed, it is loaded with
	            ``AutoModelForCausalLM.from_pretrained`` (with
	            ``trust_remote_code=True``). The model is then moved to
	            ``device`` and switched to eval mode.
	        hf_access_token: Hugging Face access token. Only required when
	            ``tokenizer`` is a string, because it is only used to load
	            the tokenizer.
	        tokenizer: Tokenizer instance, or a string that is loaded with
	            ``AutoTokenizer.from_pretrained``.
	        device: String representation of the device to run on. If not
	            set, ``cuda:0`` is used when CUDA is available, else ``cpu``.
	        max_length: Maximum length in tokens, input prompt + generated
	            tokens.
	        assistant_model: If set, this model is passed to ``generate`` as
	            the draft model for speculative generation. A string is
	            loaded the same way as ``model``.
	        generate_kwargs: Extra kwargs passed to the hf generate function.
	            Keys given here take precedence over the defaults this
	            function sets (``max_length``, ``pad_token_id``,
	            ``assistant_model``).

	    Returns:
	        A tuple ``(output_text, generation_time)``: the decoded output
	        (special tokens skipped) and the duration of the ``generate``
	        call in seconds.

	    Raises:
	        ValueError: If device is set to CUDA but no CUDA device is detected.
	        ValueError: If tokenizer is not set.
	        ValueError: If tokenizer is a string and hf_access_token is not
	            specified.
	    """
	    if not device:
	        if torch.cuda.is_available() and torch.cuda.device_count():
	            device = 'cuda:0'
	            logging.warning(
	                'inference device is not set, using cuda:0, %s',
	                torch.cuda.get_device_name(0)
	            )
	        else:
	            device = 'cpu'
	            logging.warning(
	                (
	                    'No CUDA device detected, using cpu, '
	                    'expect slower speeds.'
	                )
	            )

	    if 'cuda' in device and not torch.cuda.is_available():
	        raise ValueError('CUDA device requested but no CUDA device detected.')

	    if not tokenizer:
	        raise ValueError('Tokenizer is not set in the generate function.')

	    # Validate before loading any model so a missing token fails fast.
	    # The token is only consumed when the tokenizer is loaded by name.
	    if isinstance(tokenizer, str) and not hf_access_token:
	        raise ValueError((
	            'Hugging face access token needs to be specified. '
	            'Please refer to https://huggingface.co/docs/hub/security-tokens'
	            ' to obtain one.'
	        ))

	    if isinstance(model, str):
	        model = AutoModelForCausalLM.from_pretrained(
	            model,
	            trust_remote_code=True
	        )
	    # The prompt tensor is created on `device`, so the model must be there.
	    model.to(device).eval()

	    if isinstance(tokenizer, str):
	        tokenizer = AutoTokenizer.from_pretrained(
	            tokenizer,
	            token=hf_access_token,
	        )

	    # Speculative mode
	    draft_model = None
	    if assistant_model:
	        draft_model = assistant_model
	        if isinstance(assistant_model, str):
	            draft_model = AutoModelForCausalLM.from_pretrained(
	                assistant_model,
	                trust_remote_code=True
	            )
	        draft_model.to(device).eval()

	    # Prepare the prompt as a batch of size one on the target device.
	    tokenized_prompt = torch.tensor(
	        tokenizer(prompt)['input_ids'],
	        device=device
	    ).unsqueeze(0)

	    # Defaults first, caller overrides second: a key such as `max_length`
	    # in generate_kwargs replaces the default instead of raising a
	    # duplicate-keyword TypeError.
	    generation_args = {
	        'max_length': max_length,
	        'pad_token_id': 0,
	        'assistant_model': draft_model,
	    }
	    if generate_kwargs:
	        generation_args.update(generate_kwargs)

	    # Generate; perf_counter is monotonic, so the measured duration is
	    # not affected by system clock adjustments.
	    stime = time.perf_counter()
	    output_ids = model.generate(tokenized_prompt, **generation_args)
	    generation_time = time.perf_counter() - stime

	    output_text = tokenizer.decode(
	        output_ids[0].tolist(),
	        skip_special_tokens=True
	    )

	    return output_text, generation_time


	def openelm_generate_parser(args: Optional[list] = None):
	    """Build the OpenELM command-line parser and parse the arguments.

	    Args:
	        args: Optional list of argument strings to parse. When None,
	            argparse reads the arguments from ``sys.argv``.

	    Returns:
	        The parsed ``argparse.Namespace`` with the attributes ``model``,
	        ``hf_access_token``, ``prompt``, ``device``, ``max_length``,
	        ``assistant_model`` and ``generate_kwargs``.

	    Raises:
	        ValueError: If a ``--generate_kwargs`` item is not of the form
	            key=value.
	    """

	    def _convert_value(raw):
	        """Convert a raw string to bool, int or float, else keep it a str."""
	        lowered = raw.lower()
	        # Without this, 'false' would stay a non-empty (truthy) string.
	        if lowered in ('true', 'false'):
	            return lowered == 'true'
	        for cast in (int, float):
	            try:
	                return cast(raw)
	            except ValueError:
	                continue
	        return raw

	    class KwargsParser(argparse.Action):
	        """Parser action class to parse kwargs of form key=value"""
	        def __call__(self, parser, namespace, values, option_string=None):
	            parsed = {}
	            for val in values:
	                # Split on the first '=' only so values may contain '='.
	                kwarg_k, separator, kwarg_v = val.partition('=')
	                if not separator or not kwarg_k:
	                    raise ValueError(
	                        (
	                            'Argument parsing error, kwargs are expected in'
	                            ' the form of key=value.'
	                        )
	                    )
	                parsed[kwarg_k] = _convert_value(kwarg_v)
	            setattr(namespace, self.dest, parsed)

	    parser = argparse.ArgumentParser('OpenELM Generate Module')
	    parser.add_argument(
	        '--model',
	        dest='model',
	        help='Path to the hf converted model.',
	        required=True,
	        type=str,
	    )
	    parser.add_argument(
	        '--hf_access_token',
	        dest='hf_access_token',
	        help='Hugging face access token, starting with "hf_".',
	        type=str,
	    )
	    parser.add_argument(
	        '--prompt',
	        dest='prompt',
	        help='Prompt for LLM call.',
	        default='',
	        type=str,
	    )
	    parser.add_argument(
	        '--device',
	        dest='device',
	        help='Device used for inference.',
	        type=str,
	    )
	    parser.add_argument(
	        '--max_length',
	        dest='max_length',
	        help='Maximum length of tokens.',
	        default=256,
	        type=int,
	    )
	    parser.add_argument(
	        '--assistant_model',
	        dest='assistant_model',
	        help=(
	            (
	                'If set, this is used as a draft model '
	                'for assisted speculative generation.'
	            )
	        ),
	        type=str,
	    )
	    parser.add_argument(
	        '--generate_kwargs',
	        dest='generate_kwargs',
	        help='Additional kwargs passed to the HF generate function.',
	        type=str,
	        nargs='*',
	        action=KwargsParser,
	    )
	    return parser.parse_args(args)


	def main():
	    """Parse the CLI arguments, run generation and print the result.

	    Prints the prompt plus generated output between separator rules,
	    followed by the generation time rounded to two decimals.
	    """
	    args = openelm_generate_parser()

	    output_text, generation_time = generate(
	        prompt=args.prompt,
	        model=args.model,
	        device=args.device,
	        max_length=args.max_length,
	        assistant_model=args.assistant_model,
	        generate_kwargs=args.generate_kwargs,
	        hf_access_token=args.hf_access_token,
	    )

	    # os.get_terminal_size raises OSError when stdout is not attached to a
	    # terminal (e.g. output piped or redirected); fall back to 80 columns.
	    try:
	        columns = os.get_terminal_size().columns
	    except OSError:
	        columns = 80

	    print_txt = (
	        f'\r\n{"=" * columns}\r\n'
	        '\033[1m Prompt + Generated Output\033[0m\r\n'
	        f'{"-" * columns}\r\n'
	        f'{output_text}\r\n'
	        f'{"-" * columns}\r\n'
	        '\r\nGeneration took'
	        f'\033[1m\033[92m {round(generation_time, 2)} \033[0m'
	        'seconds.\r\n'
	    )
	    print(print_txt)


	if __name__ == '__main__':
	    main()