This is a brief record of the pitfalls and problems we ran into while configuring GPU environments.

Useful toolkit

# check the cuda version
$ cat /usr/local/cuda/version.txt 

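# check the cudnn version (cuDNN <= 7 keeps the version macros in cudnn.h)
$ cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
# for cuDNN >= 8 the version macros moved to cudnn_version.h
$ cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2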

# check the GPU driver version
cat /proc/driver/nvidia/version

Ubuntu

Reinstall GPU driver

# Remove old drivers
$ sudo apt-get purge 'nvidia-*'

$ sudo add-apt-repository ppa:graphics-drivers/ppa
$ sudo apt-get update
# If you are unsure which driver version to install, run `ubuntu-drivers devices` to list the recommended one
$ sudo apt-get install nvidia-<driver-version>

$ reboot
$ nvidia-smi

# open nvidia settings -> PRIME Profile 
nvidia-settings

CUDA

Ubuntu

# Download & install cuda 10.1
$ wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-ubuntu1604-10-1-local-10.1.105-418.39_1.0-1_amd64.deb

$ sudo dpkg -i cuda-repo-<name>.deb
$ sudo apt-key add /var/cuda-repo-10-1-local-10.1.105-418.39/7fa2af80.pub 
$ sudo apt-get update
$ sudo apt-get install cuda-10-1

# config ~/.bashrc (single quotes keep $PATH / $LD_LIBRARY_PATH unexpanded until the shell starts)
$ echo 'export PATH=/usr/local/cuda-10.1/bin:$PATH' >> ~/.bashrc
$ echo 'export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
$ source ~/.bashrc

CentOS

$ wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm

$ sudo rpm -i cuda-repo-rhel7-10-1-local-10.1.105-418.39-1.0-1.x86_64.rpm
$ sudo yum clean all
# installed at /usr/local/cuda-10.1
$ sudo yum install cuda

# config ~/.bashrc (single quotes keep $PATH / $LD_LIBRARY_PATH unexpanded until the shell starts)
$ echo 'export PATH=/usr/local/cuda-10.1/bin:$PATH' >> ~/.bashrc
$ echo 'export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
$ source ~/.bashrc

Installation

Flax

1
2
3

pip install --upgrade pip
# Installs the wheel compatible with CUDA 11 and cuDNN 8.2 or newer.
pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_releases.html  # Note: wheels only available on linux.

Check if gpu works:

>>> import jax
>>> jax.devices()

# or 

from jax.lib import xla_bridge
print(xla_bridge.get_backend().platform)

Tensorflow

1
2
3

# Install
# This command will include all required packages including compatible cuda and cudnn.
# Optionally pin a version, e.g. `tensorflow-gpu=1.15`.
$ conda create --name tf_gpu tensorflow-gpu

# Check if tensorflow-gpu works
# Build a graph.
>> a = tf.constant(5.0)
>> b = tf.constant(6.0)
>> c = a * b

# Launch the graph in a session.
# Use ONE of the following, depending on the installed TensorFlow version:
>> sess = tf.compat.v1.Session() # TensorFlow 2.x
>> sess = tf.Session() # TensorFlow 1.x

# Evaluate the tensor `c`.
>> print(sess.run(c))

# Or, only for v1:
>> import tensorflow as tf
>> sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

PyTorch

Install PyTorch (gpu)

# Install
$ conda install pytorch torchvision cudatoolkit=10.1 -c pytorch

# Check if pytorch works on gpu
>> import torch; torch.cuda.is_available()

NCCL

Install NCCL^[3]

Install from official deb

# 1. Install from local deb
sudo dpkg -i nccl-repo-<version>.deb
# Or, from network deb
# sudo dpkg -i nvidia-machine-learning-repo-<version>.deb

# 2. Update APT database
sudo apt update

# 3. Install the libnccl2 package with APT
sudo apt install libnccl2 libnccl-dev

Apex

Install Apex

$ git clone https://github.com/NVIDIA/apex
$ cd apex
$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# If pip reports the error "Given no hashes to check 137 links for project ...",
# install with setup.py instead:
$ python3 setup.py install

Conda

# conda export yml
conda env export > environment.yml

# conda install from yml
conda env create -f environment.yml

Yekun's Note

Deep Learning Toolkit Installation

Useful toolkit

Ubuntu

Reinstall GPU driver

CUDA

Ubuntu

CentOS

Installation

Flax

Tensorflow

PyTorch

NCCL

Apex

Conda

References