��/��/��ţ

��

马斯克开源的 grok-1 大模型对比 openai chatGPT 源码硬核第一篇

��Դ�� - ��Х��

��չ�Ķ�

ǰ��

��ϵĴ󲿷��ݶ��ǳ��ֹ��ʹ��һ��򵥿�һ��˹��쿪Դ�� grok ��ʲô��ݡ�

Grok-1

该存储库包含用于加载和运行Grok-1开放权重模型的JAX示例代码。

请确保下载检查点并将ckpt-0目录放置在checkpoints中 - 参见下载权重[1]。

Ȼ��

pip install -r requirements.txt
python run.py

�Բ��Դ��롣

�ýű��ؼ��㲢��ģ�ͶԲ��в��

由于模型规模较大（314B参数），需要具有足够GPU内存的机器才能使用示例代码测试模型。该存储库中MoE层的实现并不高效。选择该实现是为了避免需要自定义内核来验证模型的正确性。

ģ�͹��

Grok-1��ǰ��ƾ��¹��

•参数:314B
•架构:8个专家的混合（MoE）
•专家利用:每个标记使用2个专家
•层数:64
•注意力头:查询使用48个，键/值使用8个
•嵌入大小:6,144
•标记化:使用131,072个标记的SentencePiece分词器
•附加功能:
  •旋转嵌入（RoPE）
  •支持激活分片和8位量化
•最大序列长度（上下文）:8,192个标记

��Ȩ��

��ʹ��ӿͻ��˺��´��Ȩ�أ�

��ֱ��ʹ��HuggingFace Hub[2]��

git clone https://github.com/xai-org/grok-1.git && cd grok-1
pip install huggingface_hub[hf_transfer]
huggingface-cli download xai-org/grok-1 --repo-type model --include ckpt-0/* --local-dir checkpoints --local-dir-use-symlinks False

��֤

此版本中的代码和相关的Grok-1权重受Apache 2.0许可证的约束。

该许可证仅适用于此存储库中的源文件和Grok-1的模型权重。

��

��ȿ� 2 ƪ�Ƚϼ򵥵ġ�

run.py ��

# ��룺��Х��# ��Ҫ�Ŀ��ģ��import logging # ��־��¼ģ��from model import LanguageModelConfig, TransformerConfig, QuantizedWeight8bit as QW8Bit # ��ģ��е��ģ��Ȩ��from runners import InferenceRunner, ModelRunner, sample_from_model # ��ģ��е��ģ��ʹ�ģ��в��

# ָ��ģ�ͼ��·��CKPT_PATH = "./checkpoints/"

def main():
    """Configure Grok-1, load the checkpoint and print one sampled completion.

    The function builds the transformer and language-model configuration,
    wraps it in a ModelRunner / InferenceRunner pair that read from
    CKPT_PATH, initializes the runner, and samples up to 100 tokens for a
    fixed prompt, printing the result to stdout.
    """
    # Transformer hyperparameters.
    # NOTE(review): 0.08838834764831845 equals 1 / sqrt(128), i.e.
    # 1 / sqrt(key_size) — presumably intentional; confirm against the model code.
    transformer_config = TransformerConfig(
        emb_size=48 * 128,
        widening_factor=8,
        key_size=128,
        num_q_heads=48,
        num_kv_heads=8,
        num_layers=64,
        attn_output_multiplier=0.08838834764831845,
        shard_activations=True,
        num_experts=8,  # MoE experts available
        num_selected_experts=2,  # MoE experts selected
        data_axis="data",
        model_axis="model",
    )

    # Language-model level settings.
    # NOTE(review): 0.5773502691896257 equals 1 / sqrt(3) and
    # 78.38367176906169 equals sqrt(6144) (= sqrt(emb_size)) — confirm intent.
    model_config = LanguageModelConfig(
        vocab_size=128 * 1024,
        pad_token=0,
        eos_token=2,
        sequence_len=8192,
        embedding_init_scale=1.0,
        output_multiplier_scale=0.5773502691896257,
        embedding_multiplier_scale=78.38367176906169,
        model=transformer_config,
    )

    model_runner = ModelRunner(
        model=model_config,
        bs_per_device=0.125,
        checkpoint_path=CKPT_PATH,
    )
    runner = InferenceRunner(
        pad_sizes=(1024,),
        runner=model_runner,
        name="local",
        load=CKPT_PATH,
        tokenizer_path="./tokenizer.model",
        local_mesh_config=(1, 8),
        between_hosts_config=(1, 1),
    )

    runner.initialize()
    generator = runner.run()

    prompt = "The answer to life the universe and everything is of course"
    completion = sample_from_model(generator, prompt, max_len=100, temperature=0.01)
    print(f"Output for prompt: {prompt}", completion)

# ��Ƿ��if __name__ == "__main__": # ��־��¼��ã��־��ΪINFO logging.basicConfig(level=logging.INFO) # �� main()

checkpoint.py

��ʲô

��δ��Ҫ��¹��ܣ�

1.��ģ��Ϳ⡣2.��һϵ��ڹ��ڴ��и��ļ��Ĺ��͸��ļ��Ч�ʲ��ʡ��Դ��3.ʵ��˿��ٵķ��л��л��ʹ��Ĺ��Ա��ڹ��ڴ��н��ļ��Ʋ��4.�ṩ�˼��ݵĺ��load_tensors��ú��֧�ֶ��̲߳��м��ݣ��ڼ��ع��ж��ݽ��з�Ƭ�ʹ��5.��һЩ��ڴ��·��ַ��ͼ��·��Լ��״̬�ָ��н��״̬һ��Լ��Ͳ��ѡ���6.ʵ��״̬�ָ��restore��ú��Ӽ��ļ��м��ݣ��ָ�Ϊ״̬��ͬʱ��״̬һ��Լ��Ͳ��ѡ���

��ĵ�Ͳ��

?�ṩ�˸�Ч��ļ��ܣ��˹��ڴ�Ͷ��̼߳��?֧��˿��ٵķ��л��л��̡�?ʵ��˲��м��ݵĹ��ܣ��˼��Ч�ʡ�?��״̬�ָ��У��֤�˼��ص�״̬��е�״̬��һ��ԣ��ݲ��ѡ��Ĵ��

1.replace_with_load_state��ú��ڸ��ݼ��״̬�滻��ʼ״̬�е��ݡ�

��ĺ��Ĳ��

?չƽ��״̬�ͳ�ʼ״̬��Լ��ȡ��״̬��ݵ�·��ӳ�䡣?��ʼ״̬�е��ݣ��ݼ��·��滻�򴴽��µ��?��滻��װ��ṹ��ء�

1.restore��ú��ڴӼ��ļ��м��ݲ��ָ�Ϊ״̬��Ĳ��

?��ļ�·��ӡ��ؼ��Ϣ��?��״̬��״��Ϣ��ݡ�?�Լ��ص�״̬��һ��Լ�飬ȷ��һ��ԡ�?��״̬��Ƭӳ��Ϊȫ��飬��ݲ��ѡ��Ƿ��ز��֡�

��Щ��ܵ�ʵ�ֻ�� JAX ��ṩ��ṹ��Ͳ��м��㹦�ܣ�ͬʱ��Ĺ��Ͷ��̼߳��Ч�ش��ļ��Ͳ��

Դ��

ע��Դ��

from __future__ import annotations# �� Python δ��֧�ֵ��ģ�飬��֧��ʾ��ʹ��ַ��ʽ��

import contextlib # ��Ĺ��ģ�飬��ڴ��Ĺ��import logging # ��־��¼ģ��import math # ��ѧģ��import os # ��ϵͳģ�飬��ڴ��ļ��Ŀ¼·��import pickle # pickle��л�ģ�飬��ڶ��л��ͷ��л�import re # ��ʽģ�飬��ַ��ƥ��import shutil # �ļ��ģ�飬��ļ��ĸ��ơ��ƶ��Ȳ��import sys # ϵͳģ�飬�ṩ�� Python ��ķ��import tempfile # ��ʱ�ļ�ģ�飬��ڴ��ʱ�ļ��Ŀ¼from concurrent.futures import ThreadPoolExecutor, wait # ��ģ�飬��ڲ��ִ��from typing import Any, Optional # ��ʾģ�飬��ָ��ͷ��ֵ��

import jax # JAX ��ֵ��import numpy as np # ��鴦��ģ��

from jax.experimental import multihost_utils # JAX ʵ��ģ�飬��ڶ��

from model import QuantizedWeight8bit # ��Զ��ģ��е�� QuantizedWeight8bit ��

logger = logging.getLogger(__name__) # ��ȡ��ǰģ��־��¼��rank_logger = logging.getLogger("rank") # ��ȡ��־��¼��Ϣ

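# Context managers that stage files through shared memory (/dev/shm): one
# copies a file into it, the other copies a file out of it.
# Each removes its temporary file when the `with` block exits.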

@contextlib.contextmanager
def copy_to_shm(file: str):
    """Yield a path under /dev/shm/ holding a copy of *file*.

    If *file* already starts with "/dev/shm/" it is yielded unchanged and
    nothing is copied or deleted. Otherwise a temporary file is created in
    /dev/shm/, *file* is copied into it, its path is yielded, and the
    temporary file is removed when the ``with`` block exits (also on error).

    Args:
        file: Path of the file to stage.

    Yields:
        The path to read from.
    """
    if file.startswith("/dev/shm/"):
        # Already in shared memory: hand the path back as-is.
        yield file
        return

    tmp_dir = "/dev/shm/"
    fd, tmp_path = tempfile.mkstemp(dir=tmp_dir)
    # mkstemp returns an open descriptor that is never used here
    # (shutil.copyfile opens the path itself). Close it immediately so it is
    # not held for the whole `with` body and cannot leak if os.remove fails.
    os.close(fd)
    try:
        shutil.copyfile(file, tmp_path)
        yield tmp_path
    finally:
        os.remove(tmp_path)

@contextlib.contextmanager
def copy_from_shm(file: str):
    """Yield a temporary /dev/shm/ path whose contents are copied to *file*.

    The caller writes to the yielded path inside the ``with`` block. When the
    block finishes without raising, the temporary file is copied to *file*.
    The temporary file is removed in every case; if the block raises, the
    copy is skipped.

    Args:
        file: Destination path that receives the data.

    Yields:
        The temporary path to write to.
    """
    tmp_dir = "/dev/shm/"
    fd, tmp_path = tempfile.mkstemp(dir=tmp_dir)
    # The descriptor from mkstemp is never used (callers reopen the path), so
    # close it right away instead of holding it for the whole `with` body;
    # this also prevents a leak if os.remove raises during cleanup.
    os.close(fd)
    try:
        yield tmp_path
        shutil.copyfile(tmp_path, file)
    finally:
        os.remove(tmp_path)

def fast_unpickle(path: str) -> Any:
    """Load and return the pickled object stored at *path*.

    The file is read through the path provided by ``copy_to_shm``.

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only use this on trusted data.
    """
    with copy_to_shm(path) as shm_path, open(shm_path, "rb") as stream:
        return pickle.load(stream)

def fast_pickle(obj: Any, path: str) -> None:
    """Pickle *obj* to *path*.

    The object is dumped to the temporary path provided by
    ``copy_from_shm``, which then delivers the data to *path*.
    """
    with copy_from_shm(path) as shm_path, open(shm_path, "wb") as stream:
        pickle.dump(obj, stream)

def load_tensors(shaped_arrays, directory, mesh_config, tensor_indices=None):
    """Load one tensor per entry of *shaped_arrays* using a thread pool.

    Each tensor is read with ``fast_unpickle`` from
    ``<directory>/tensor{i:05d}_{idx:03d}``, where ``i`` is the tensor index
    and ``idx`` is derived from ``jax.process_index()`` and the product of
    *mesh_config*.

    Args:
        shaped_arrays: Iterable of objects exposing ``shape`` and ``dtype``.
        directory: Directory containing the tensor files.
        mesh_config: Iterable of ints; their product is used as the number of
            data/model shards when computing the file index.
        tensor_indices: Optional indices paired with *shaped_arrays*; when
            ``None`` the entries are numbered 0, 1, 2, ...

    Returns:
        A list with one loaded object per input entry, in input order.
        Exceptions raised while loading propagate from the first failing entry.
    """
    num_replicas = 1
    data_model_shards = math.prod(mesh_config)
    if tensor_indices is None:
        iterator = enumerate(shaped_arrays)
    else:
        iterator = zip(tensor_indices, shaped_arrays)

    # Use the executor as a context manager so its worker threads are shut
    # down when loading finishes; previously the pool was never released.
    with ThreadPoolExecutor(max_workers=32) as pool:
        fs = []
        for i, t in iterator:
            process_index = jax.process_index()
            # With num_replicas == 1 both sides are always 0, so every tensor
            # is read from disk; the zero-filled branch is kept for parity
            # with a multi-replica setup.
            if (i % num_replicas) == ((process_index // data_model_shards) % num_replicas):
                idx = (
                    process_index // (num_replicas * data_model_shards) * data_model_shards
                    + process_index % data_model_shards
                )
                fs.append(
                    pool.submit(
                        fast_unpickle,
                        os.path.join(directory, f"tensor{i:05d}_{idx:03d}"),
                    )
                )
            else:
                # Placeholder of matching shape/dtype for tensors not loaded here.
                fs.append(pool.submit(np.zeros, t.shape, dtype=t.dtype))
        wait(fs)
        return [f.result() for f in fs]

def path_tuple_to_string(path: tuple) -> str:
    """Join the named components of a JAX tree path with "/".

    ``DictKey`` entries contribute their ``key`` and ``GetAttrKey`` entries
    their ``name``. ``FlattenedIndexKey`` and ``SequenceKey`` entries are
    skipped; any other entry type trips an assertion.
    """
    tree_util = jax.tree_util
    names = []
    for entry in path:
        if isinstance(entry, tree_util.DictKey):
            names.append(entry.key)
            continue
        if isinstance(entry, tree_util.GetAttrKey):
            names.append(entry.name)
            continue
        # Positional entries carry no name and are dropped from the string.
        assert isinstance(entry, (tree_util.FlattenedIndexKey, tree_util.SequenceKey))
    return "/".join(names)

def get_load_path_str(
    init_path_str: str,
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
) -> Optional[str]:
    """Map a parameter path to the path it should be loaded from.

    Args:
        init_path_str: Path string of the parameter being initialized.
        load_rename_rules: Optional ``(search_pattern, replacement)`` regex
            pairs. Only the first pair whose pattern matches is applied.
        load_exclude_rules: Optional regex patterns; a match on any of them
            excludes the path.

    Returns:
        ``None`` if an exclude rule matches, otherwise the (possibly renamed)
        path string.
    """
    # Exclusion is checked before any renaming.
    if any(re.search(pattern, init_path_str) for pattern in (load_exclude_rules or ())):
        return None

    for pattern, replacement in load_rename_rules or ():
        if re.search(pattern, init_path_str):
            # First matching rule wins; later rules are not consulted.
            return re.sub(pattern, replacement, init_path_str)

    return init_path_str

def replace_with_load_state(
    init_state: Any,
    load_state: Any,
    load_rename_rules: Optional[list[tuple[str, str]]] = None,
    load_exclude_rules: Optional[list[str]] = None,
    mesh_config: tuple = (1, 1),
) -> Any:
    """Return *init_state* with leaves replaced by matching leaves of *load_state*.

    Both trees are flattened with paths; each leaf of *init_state* is looked
    up in *load_state* by its path string after applying the exclude/rename
    rules (see ``get_load_path_str``).

    Args:
        init_state: Tree providing the output structure and fallback leaves.
        load_state: Tree providing the loaded leaves.
        load_rename_rules: Optional ``(pattern, replacement)`` regex pairs.
        load_exclude_rules: Optional regex patterns of paths to leave untouched.
        mesh_config: Iterable of ints; their product is used as the number of
            data/model shards for the "not found" fallback.

    Returns:
        A tree with the structure of *init_state*.
    """
    loaded_leaves, _ = jax.tree_util.tree_flatten_with_path(load_state)
    init_leaves, init_treedef = jax.tree_util.tree_flatten_with_path(init_state)
    load_map = {path_tuple_to_string(leaf_path): leaf for leaf_path, leaf in loaded_leaves}

    num_replicas = 1
    data_model_shards = math.prod(mesh_config)
    result = []
    for index, (init_path, init_tensor) in enumerate(init_leaves):
        init_name = path_tuple_to_string(init_path)
        load_name = get_load_path_str(init_name, load_rename_rules, load_exclude_rules)

        if load_name is None:
            # Excluded by rule: keep the initial leaf.
            rank_logger.info(f"Excluded from restore: {init_name}.")
            result.append(init_tensor)
            continue

        if load_name not in load_map:
            # Nothing to load: keep the initial leaf, or zeros when the
            # replica check below does not select this process.
            rank_logger.info(f"Not found in ckpt: {init_name}.")
            selected = (index % num_replicas) == (
                (jax.process_index() // data_model_shards) % num_replicas
            )
            result.append(init_tensor if selected else np.zeros_like(init_tensor))
            continue

        if load_name == init_name:
            rank_logger.info(f"Restored from ckpt: {init_name}.")
        else:
            rank_logger.info(f"Restored from ckpt: {init_name} <-- {load_name}.")
        result.append(load_map[load_name])

    return jax.tree_util.tree_unflatten(init_treedef, result)

def restore(
    checkpoint_path: str,
    state_shapes: Any,
    mesh,
    between_hosts_config,
    params_only,
    state_sharding,
    init_state: Optional[Any] = None,
) -> Any:
    """Load the checkpoint under ``<checkpoint_path>/ckpt-0`` into a state tree.

    Args:
        checkpoint_path: Directory containing the ``ckpt-0`` sub-directory.
        state_shapes: Tree whose structure and leaves describe the state to
            load; the result has the same structure.
        mesh: Passed to ``host_local_array_to_global_array``.
        between_hosts_config: Passed to ``load_tensors`` as its mesh config.
        params_only: When truthy, only ``state.params`` is returned.
        state_sharding: Tree of sharding specs; ``None`` leaves are replaced
            with an empty ``PartitionSpec``.
        init_state: Only checked against ``None``: when given, a key mismatch
            between the loaded state and *state_sharding* is tolerated.

    Returns:
        The loaded state (or its ``params``) after conversion to global arrays.

    Raises:
        ValueError: If the ``params`` keys of the loaded state and of
            *state_sharding* differ and *init_state* is ``None``.
    """
    ckpt_path = os.path.join(checkpoint_path, "ckpt-0")
    rank_logger.info("Loading checkpoint at {}".format(ckpt_path))

    # Flatten the shape tree, load one tensor per leaf, then rebuild the tree.
    shapes_with_path, treedef = jax.tree_util.tree_flatten_with_path(state_shapes)
    leaf_shapes = [shape for _, shape in shapes_with_path]
    tensors = load_tensors(leaf_shapes, ckpt_path, between_hosts_config)
    state = jax.tree_util.tree_unflatten(treedef, tensors)

    # Consistency check between the loaded state and the sharding tree.
    ckpt_keys = set(state.params.keys())
    code_keys = set(state_sharding.params.keys())
    keys_match = ckpt_keys == code_keys
    if not keys_match and init_state is None:
        missing_in_ckpt = code_keys - ckpt_keys
        missing_locally = ckpt_keys - code_keys
        raise ValueError(
            "Parameters in the code are not matching checkpoint parameters.\n"
            "Params missing in checkpoint: {}\nParams missing in code: {}".format(
                missing_in_ckpt, missing_locally
            )
        )

    def _is_unset(spec):
        return spec is None

    def _fill_unset(spec):
        # Leaves without a spec get an empty PartitionSpec.
        return jax.sharding.PartitionSpec() if spec is None else spec

    state_sharding = jax.tree_util.tree_map(_fill_unset, state_sharding, is_leaf=_is_unset)
    state = multihost_utils.host_local_array_to_global_array(state, mesh, state_sharding)
    return state.params if params_only else state

References

[1]��Ȩ��:#downloading-the-weights

[2]HuggingFace Hub:https://huggingface.co/xai-org/grok-1

��: 2024-03-242024-03-24 09:27:00
ԭ��https://page.om.qq.com/page/OoKVFxW1xCTmxoiFs0pR4YTw0
��Ѷ��Ѷ�ƿ��Ѷ��ݿ��ƽ̨�ʺţ��ţ��֮һ��Ѷ��ݿ��ƽ̨��Э�顷ת�ط��ݡ�
��Ȩ��ϵ cloudcommunity@tencent.com ɾ��

��Ѷ

ɨ��

��վ�� Ⱥ

��ȡר�� 10Ԫ��ż�ȯ

˽�� ��ɻ�

��˹�˿�Դ�� grok-1 ��ģ�ͶԱ� openai chatGPT Դ��Ӳ�˵�һ��

��Ѷ

ɨ��

��

�

��Դ

��

��Ѷ�ƿ��

��Ų�Ʒ

��Ƽ�

��Ƽ�

��˹�˿�Դ�� grok-1 ��ģ�ͶԱ� openai chatGPT Դ��Ӳ�˵�һ��

�����Ѷ

����

�

��Դ

����

��Ѷ�ƿ�����

���Ų�Ʒ

�����Ƽ�

�����Ƽ�

��Ѷ

��

��

��Ѷ�ƿ��

��Ų�Ʒ

��Ƽ�

��Ƽ�