Analysis of the LLaMA Installation Process

Contents of requirements.txt

torch          # PyTorch, the open-source machine learning framework
fairscale      # PyTorch extension for simplifying and optimizing distributed training; targets high-performance, large-scale training
fire           # library for generating command-line interfaces (CLIs) from Python objects
sentencepiece  # open-source text-processing toolkit for tokenization and subword segmentation
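
As a quick illustration of what fire does, the sketch below (hypothetical script and function names, not part of the LLaMA repository) turns an ordinary Python function into a command-line tool:

# greet.py -- hypothetical example of building a CLI with fire
import fire

def greet(name: str = "world", times: int = 1):
    """Every parameter automatically becomes a command-line flag."""
    for _ in range(times):
        print(f"Hello, {name}!")

if __name__ == "__main__":
    # e.g. `python greet.py --name=LLaMA --times=2` calls greet("LLaMA", 2)
    fire.Fire(greet)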

Installing the LLaMA Package

(venv) root@VM-28-11-ubuntu:/opt/llama-main# pip install -r requirements.txt
# run setup.py in the current directory to install the project in editable (develop) mode
(venv) root@VM-28-11-ubuntu:/opt/llama-main# pip install -e .
Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Obtaining file:///opt/llama-main
Installing collected packages: llama
Running setup.py develop for llama
Successfully installed llama

# pip list shows that the project is now installed in the Python environment
(venv) root@VM-28-11-ubuntu:/opt/llama-main# pip list
Package Version Location
------------------------ ---------- ---------------
cmake 3.26.4
fairscale 0.4.13
filelock 3.12.2
fire 0.5.0
Jinja2 3.1.2
lit 16.0.6
llama 0.0.0 /opt/llama-main # the LLaMA project installed into the Python environment
MarkupSafe 2.1.3
mpmath 1.3.0
networkx 3.1
numpy 1.24.4
nvidia-cublas-cu11 11.10.3.66
nvidia-cuda-cupti-cu11 11.7.101
nvidia-cuda-nvrtc-cu11 11.7.99
nvidia-cuda-runtime-cu11 11.7.99
nvidia-cudnn-cu11 8.5.0.96
nvidia-cufft-cu11 10.9.0.58
nvidia-curand-cu11 10.2.10.91
nvidia-cusolver-cu11 11.4.0.1
nvidia-cusparse-cu11 11.7.4.91
nvidia-nccl-cu11 2.14.3
nvidia-nvtx-cu11 11.7.91
pip 20.0.2
pkg-resources 0.0.0
sentencepiece 0.1.99
setuptools 44.0.0
six 1.16.0
sympy 1.12
termcolor 2.3.0
torch 2.0.1
triton 2.0.0
typing-extensions 4.7.1
wheel 0.40.0
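
pip install -e . works because the repository ships a setup.py. The following is only a minimal sketch of what such a file generally looks like, not the actual file from the llama repository:

# setup.py -- minimal sketch (illustrative only, NOT the repository's real file)
from setuptools import setup, find_packages

setup(
    name="llama",
    version="0.0.0",
    packages=find_packages(),
    install_requires=["torch", "fairscale", "fire", "sentencepiece"],
)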

Related links for the installation process

Analysis of the LLaMA Run Process

Imported Modules

from typing import Tuple
import os
import sys
import torch
import fire
import time
import json

from pathlib import Path

from fairscale.nn.model_parallel.initialize import initialize_model_parallel

from llama import ModelArgs, Transformer, Tokenizer, LLaMA

Analysis of the setup_model_parallel Function

def setup_model_parallel() -> Tuple[int, int]:
    # local_rank is the rank (index) of this process on the local node
    local_rank = int(os.environ.get("LOCAL_RANK", -1))

    # world_size is the total number of processes, i.e. the number of GPUs
    world_size = int(os.environ.get("WORLD_SIZE", -1))

    # use nccl as the torch distributed backend; gloo or mpi are alternatives
    torch.distributed.init_process_group("nccl")

    # initialize the model-parallel process groups with the given size
    initialize_model_parallel(world_size)

    # bind this process (its model and data) to the specified GPU
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size
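
LOCAL_RANK and WORLD_SIZE are normally injected by torchrun. For single-process debugging without torchrun, something like the following sketch (an assumption, not code from the repository) could set the variables that init_process_group's default env:// rendezvous expects:

# Hypothetical single-process debugging setup: set the variables torchrun
# would normally inject before calling setup_model_parallel().
import os

os.environ.setdefault("LOCAL_RANK", "0")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

local_rank, world_size = setup_model_parallel()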

Analysis of the load Function

def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    # record the load start time
    start_time = time.time()

    # find all .pth files, i.e. the saved LLaMA model weight shards
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))

    # check that the number of weight shards matches world_size (the number of GPUs)
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"

    # pick the shard belonging to this process (shard local_rank)
    ckpt_path = checkpoints[local_rank]

    # load the LLaMA weight shard into CPU memory
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")

    # read params.json
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    # build the model arguments
    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )

    # load the tokenizer, whose main job is to turn text into token ids the model accepts
    tokenizer = Tokenizer(model_path=tokenizer_path)

    # set vocab_size from the tokenizer's vocabulary size
    model_args.vocab_size = tokenizer.n_words

    # set the default tensor type to fp16 on the GPU for model construction
    torch.set_default_tensor_type(torch.cuda.HalfTensor)

    # build the LLaMA model on the Transformer architecture; at this point the
    # weights are freshly initialized, not yet the trained ones
    model = Transformer(model_args)

    # restore the default tensor type
    torch.set_default_tensor_type(torch.FloatTensor)

    # copy the checkpoint weights into the LLaMA model
    model.load_state_dict(checkpoint, strict=False)

    # combine the LLaMA model and the tokenizer into a generator
    generator = LLaMA(model, tokenizer)

    # print how long loading took
    print(f"Loaded in {time.time() - start_time:.2f} seconds")

    # return the generator
    return generator
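
Because torch.load maps the checkpoint into CPU memory, it is worth knowing the shard size up front. The hedged sketch below (the path is the /opt/llama directory assumed from the tests later in this article) sums the tensor sizes in one shard:

# Sketch: estimate how much CPU memory a .pth shard needs by summing the
# sizes of its tensors. /opt/llama is the hypothetical checkpoint directory.
import torch
from pathlib import Path

ckpt_path = sorted(Path("/opt/llama").glob("*.pth"))[0]
state_dict = torch.load(ckpt_path, map_location="cpu")
total_bytes = sum(t.numel() * t.element_size() for t in state_dict.values())
print(f"{total_bytes / 1024**3:.1f} GiB of parameters in {ckpt_path.name}")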

Analysis of the main Function

def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_seq_len: int = 512,
    max_batch_size: int = 32,
):
    # set up model parallelism and get this process's rank and the world size
    local_rank, world_size = setup_model_parallel()

    # silence all ranks except rank 0 by redirecting stdout to /dev/null
    if local_rank > 0:
        sys.stdout = open(os.devnull, "w")

    # call load() to build the generator
    generator = load(
        ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
    )

    # prompts used for testing
    prompts = [
        # For these prompts, the expected answer is the natural continuation of the prompt
        "I believe the meaning of life is",
        "Simply put, the theory of relativity states that ",
        "Building a website can be done in 10 simple steps:\n",
        # Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api
        """Tweet: "I hate it when my phone battery dies."
Sentiment: Negative
###
Tweet: "My day has been 👍"
Sentiment: Positive
###
Tweet: "This is the link to the article"
Sentiment: Neutral
###
Tweet: "This new music video was incredibile"
Sentiment:""",
        """Translate English to French:

sea otter => loutre de mer

peppermint => menthe poivrée

plush girafe => girafe peluche

cheese =>""",
    ]

    # feed the prompts to the LLaMA model and collect the generations
    results = generator.generate(
        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
    )

    # print the results
    for result in results:
        print(result)
        print("\n==================================\n")

Related links for the run process

Model Weight Loading Tests

Test Environment

# OS version
ubuntu@VM-28-11-ubuntu:~$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04 LTS"
ubuntu@VM-28-11-ubuntu:~$
# Python version
ubuntu@VM-28-11-ubuntu:~$ pip -V
pip 20.0.2 from /usr/lib/python3/dist-packages/pip (python 3.8)
ubuntu@VM-28-11-ubuntu:~$ python -V
Python 3.8.10
# memory
ubuntu@VM-28-11-ubuntu:~$ free -h
total used free shared buff/cache available
Mem: 7.3Gi 203Mi 6.0Gi 1.0Mi 1.0Gi 6.8Gi
Swap: 31Gi 110Mi 31Gi

Test 1: gloo as the Distributed Backend Plus a Swap Partition

Change torch.distributed.init_process_group("nccl") to torch.distributed.init_process_group("gloo"), i.e. use the CPU-based gloo backend for distributed communication. In addition, to work around the shortage of physical memory, 32 GB of disk space is set up as a swap partition. A small sketch of the backend change follows.
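
One hedged way to express this change is to pick the backend from GPU availability rather than hard-coding it (an illustration, not what the original example.py does):

# Sketch: choose the distributed backend based on whether a GPU is available
# (drop-in replacement for the hard-coded "nccl" line in setup_model_parallel).
backend = "nccl" if torch.cuda.is_available() else "gloo"
torch.distributed.init_process_group(backend)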

# add 32 GB of disk-backed swap space
root@VM-28-11-ubuntu:~# free -h
total used free shared buff/cache available
Mem: 7.3Gi 301Mi 3.4Gi 2.0Mi 3.6Gi 6.7Gi
Swap: 0B 0B 0B
root@VM-28-11-ubuntu:~# dd if=/dev/zero of=/newswap bs=1G count=32
32+0 records in
32+0 records out
34359738368 bytes (34 GB, 32 GiB) copied, 152.208 s, 226 MB/s
root@VM-28-11-ubuntu:~# mkswap /newswap
mkswap: /newswap: insecure permissions 0644, 0600 suggested.
Setting up swapspace version 1, size = 32 GiB (34359734272 bytes)
no label, UUID=7ecc0452-08aa-45b4-86c3-16bd1865f080
root@VM-28-11-ubuntu:~# swapon /newswap
swapon: /newswap: insecure permissions 0644, 0600 suggested.
root@VM-28-11-ubuntu:~# free -h
total used free shared buff/cache available
Mem: 7.3Gi 306Mi 1.1Gi 2.0Mi 5.8Gi 6.7Gi
Swap: 31Gi 0B 31Gi

(venv) root@VM-28-11-ubuntu:/opt/llama-main# torchrun --nproc_per_node 1 exampleV1.1.py --ckpt_dir /opt/llama --tokenizer_path /opt/llama/tokenizer.model
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loading
Loaded in 257.78 seconds # model weights loaded successfully using physical memory plus swap
Traceback (most recent call last):
File "exampleV1.1.py", line 119, in <module>
fire.Fire(main)
File "/opt/llama-main/venv/lib/python3.8/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/opt/llama-main/venv/lib/python3.8/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/opt/llama-main/venv/lib/python3.8/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "exampleV1.1.py", line 109, in main
results = generator.generate(
File "/opt/llama-main/llama/generation.py", line 43, in generate
logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
File "/opt/llama-main/venv/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/llama-main/llama/model.py", line 235, in forward
h = layer(h, start_pos, freqs_cis, mask)
File "/opt/llama-main/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/llama-main/llama/model.py", line 193, in forward
h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask)
File "/opt/llama-main/llama/model.py", line 121, in forward
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
File "/opt/llama-main/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/llama-main/venv/lib/python3.8/site-packages/fairscale/nn/model_parallel/layers.py", line 290, in forward
output_parallel = F.linear(input_parallel, self.weight, self.bias)
RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'

Test results:

  • When physical memory runs out, the swap partition takes over and the model weights load successfully; in total roughly 36 GB of memory or swap space is needed.
  • With gloo as the distributed backend, the model weights can be loaded on a machine without a GPU, although generation then fails on the CPU with RuntimeError: "addmm_impl_cpu_" not implemented for 'Half' (see the sketch below).
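
The Half-precision error occurs because fp16 matrix multiplication is not implemented on the CPU in this torch version. A hedged workaround (not part of the original example.py, and it roughly doubles memory use) is to keep the model in float32 when building and loading it:

# Sketch of a CPU-only workaround for the Half-precision error above:
# build the model with float32 as the default tensor type instead of
# torch.cuda.HalfTensor, so CPU matmuls are supported.
torch.set_default_tensor_type(torch.FloatTensor)
model = Transformer(model_args)
model.load_state_dict(checkpoint, strict=False)  # fp16 weights are cast to fp32 on copy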

Test 2: Loading Model Weights Directly into GPU Memory

Change checkpoint = torch.load(ckpt_path, map_location="cpu") to checkpoint = torch.load(ckpt_path, map_location=torch.device('cuda:0')) so that the checkpoint is loaded directly into GPU memory. CPU memory usage then stays at roughly 3 GB, but once the 16 GB of GPU memory approaches 100% utilization, loading fails with torch.cuda.OutOfMemoryError.
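
Before loading a shard straight onto the GPU, it helps to check how much headroom the card has; a 7B fp16 checkpoint is already on the order of 13 GiB, close to the 14.75 GiB total reported in the traceback below. A small hedged sketch:

# Sketch: report free GPU memory before attempting a direct-to-GPU load.
import torch

free_bytes, total_bytes = torch.cuda.mem_get_info(0)
print(f"GPU 0: {free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")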

Loading
Traceback (most recent call last):
File "/content/llama-main/example.py", line 120, in <module>
fire.Fire(main)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/usr/local/lib/python3.10/dist-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/content/llama-main/example.py", line 79, in main
generator = load(
File "/content/llama-main/example.py", line 58, in load
model = Transformer(model_args)
File "/content/llama-main/llama/model.py", line 211, in __init__
self.layers.append(TransformerBlock(layer_id, params))
File "/content/llama-main/llama/model.py", line 184, in __init__
self.attention = Attention(args)
File "/content/llama-main/llama/model.py", line 104, in __init__
self.wo = RowParallelLinear(
File "/usr/local/lib/python3.10/dist-packages/fairscale/nn/model_parallel/layers.py", line 349, in __init__
self.weight = Parameter(torch.Tensor(self.out_features, self.input_size_per_partition))
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 14.75 GiB total capacity; 14.14 GiB already allocated; 18.81 MiB free; 14.14 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 5080) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
example.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-07-12_23:47:34
host : ef7494e0b7ae
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 5080)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

Summary

  1. Loading the LLaMA 7B model weights requires roughly 36 GB of memory or swap space; when physical memory is insufficient, adding a swap partition works around the problem.
  2. The example.py in the LLaMA repository uses GPU resources for inference by default; on a machine without a GPU it fails with RuntimeError: Found no NVIDIA driver on your system.