First step - log into Falcon, either through OnDemand or the SSH jump box. If you do not already have an account, you can request one here.
The Getting Started Guide has more information about loading modules and the partitions on Falcon.
These nodes were purchased by the I-CREWS grant, and researchers associated with that project have priority access. Your job may be interrupted at any time, so checkpoint often.
There are a couple of options for getting a more modern version of Python so we can use libraries/packages that make good use of the GPUs.
Start by opening a terminal on the staging1 server (choose Clusters -> staging Shell Access from the OnDemand interface).
The first option is to load one of the Python modules installed on the cluster:
module use /opt/modules/modulefiles
module avail
module load python/3.8.11
pip3 list
The second option is to install Miniconda in your home directory and create a conda environment:
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
sha256sum Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
source ~/miniconda3/bin/activate
conda create -n pygpu
conda activate pygpu
conda install python=3.10
pip3 install tensorflow[and-cuda]
pip3 install tensorflow_datasets matplotlib
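As a quick sanity check of the install, you can start python3 and ask TensorFlow which GPUs it can see. Keep in mind that staging1 itself has no GPU, so expect an empty list there; run the same check inside a GPU job to see the actual devices:
# quick sanity check: which GPUs can TensorFlow see?
# (on staging1 this prints [], since that node has no GPU)
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))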
Now make a directory to keep things organized:
mkdir gpu-workshop
cd gpu-workshop
Cluster nodes do not have internet access, so you need to download any data prior to submitting the job.
Start up the python interpreter:
python3
Then within python, run:
import tensorflow_datasets as tfds
BATCH_SIZE = 64
train_ds = tfds.load('imdb_reviews', split='train[:80%]', batch_size=BATCH_SIZE, shuffle_files=True, as_supervised=True)
exit()
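Optionally, you can confirm the dataset is now cached on disk (so the offline compute node will not need to reach the internet) by starting the interpreter again and loading it with downloads disabled; download=False makes tfds raise an error instead of going to the network, so this only succeeds if the data is already present:
import tensorflow_datasets as tfds
# download=False: fail instead of downloading, so this only works
# if the imdb_reviews data is already cached locally
ds = tfds.load('imdb_reviews', split='train', download=False, as_supervised=True)
for text, label in ds.take(1):
    print(label.numpy(), text.numpy()[:80])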
Python script to train the model, saved as 'sentiment.train.py', following the TensorFlow text-classification RNN example here. The script file can be created through the console by copy/pasting or through the OnDemand interface. Update the data_dir path to point to your own tensorflow_datasets directory.
#!/bin/python
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import argparse
import os
import matplotlib.pyplot as plt
parser = argparse.ArgumentParser(description="Trains and saves a tensorflow-keras model for sentiment analysis")
parser.add_argument("-j","--jobid",help="the slurm jobid or other unique number",required=False,default="00000")
args = parser.parse_args()
tfds.disable_progress_bar()
dataset, info = tfds.load('imdb_reviews', data_dir='/lfs/boswald.ui/tensorflow_datasets', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
history = model.fit(train_dataset, epochs=3,
                    validation_data=test_dataset,
                    validation_steps=30)
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
model.save_weights("sentiment.ckpt"+args.jobid)
#export a plot of the training
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)
plt.savefig(os.getcwd()+"/training_results.png", format='png', dpi=150)
#this saves, but is buggy and can't load the model again:
model.save('sentiment.'+args.jobid)
exit()
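Because jobs on the gpu-volatile partition can be interrupted at any time, you may also want the script to checkpoint during training rather than only saving weights at the end. The following is an optional sketch, not part of the script above: it swaps the model.fit call for a version that writes weights every epoch with a Keras ModelCheckpoint callback. The checkpoint path is just an example, it reuses model, train_dataset, test_dataset and args from the script, and on newer Keras versions a weights-only checkpoint path may need to end in .weights.h5 instead.
# optional: write weights at the end of every epoch so an interrupted job
# can resume from the last checkpoint (path below is illustrative; uses
# model, train_dataset, test_dataset and args from the script above)
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="sentiment."+args.jobid+".{epoch:02d}.ckpt",
    save_weights_only=True)
history = model.fit(train_dataset, epochs=3,
                    validation_data=test_dataset,
                    validation_steps=30,
                    callbacks=[checkpoint_cb])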
Next, create a slurm submit script (saved as 'sentiment.slurm') to run the training on a GPU node. If you are using the module-based Python:
#!/bin/bash
#SBATCH -p gpu-volatile
#SBATCH --gres gpu:1
cd $SLURM_SUBMIT_DIR
hostname
module use /opt/modules/modulefiles
module load python/3.8.11 cuda/12.2
START=$(date +%s)
python3 sentiment.train.py -j $SLURM_JOBID
let RUNTIME=$(date +%s)-$START
echo "Training time: $RUNTIME"
echo "*--done--*"
Or, if you are using the Miniconda environment instead:
#!/bin/bash
#SBATCH -p gpu-volatile
#SBATCH --gres gpu:1
cd $SLURM_SUBMIT_DIR
hostname
source ~/miniconda3/bin/activate
conda activate pygpu
START=$(date +%s)
python3 sentiment.train.py -j $SLURM_JOBID
let RUNTIME=$(date +%s)-$START
echo "Training time: $RUNTIME"
echo "*--done--*"
Now submit the job:
(pygpu) boswald.ui@staging1 ~/gpu-workshop * sbatch sentiment.slurm
Submitted batch job 10092
(pygpu) boswald.ui@staging1 ~/gpu-workshop * squeue --me
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
10092 short sentimen boswald. R 0:07 1 r3i5n0
(pygpu) boswald.ui@staging1 ~/gpu-workshop *
Now let's use the model we trained. First, create the inference file (saved as 'inference.py'):
#!/bin/python3
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import argparse
import code
parser = argparse.ArgumentParser(description="Loads a trained tensorflow-keras sentiment model for interactive inference")
parser.add_argument("-j","--jobid",help="the slurm jobid or other unique number",required=False,default="00000")
args = parser.parse_args()
tfds.disable_progress_bar()
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
model.load_weights("sentiment.ckpt"+args.jobid)
print("#--------------------------------------------------------------------#")
print("\nUse the infer function to analyze some text. For example:\ninfer('this is the text to analyze sentiment in',model) \n negative numbers indicate negative sentiment, positive numbers positive sentiment\n")
def infer(thetext, mdl):
    predicts = mdl.predict(np.array([thetext]))
    print("sentiment: "+str(predicts[0]))
code.interact(local=locals())
exit()
Run the file, using the job number of the job that trained the model:
ls
python3 inference.py -j 10096
...
(InteractiveConsole)
>>> infer('some text to analyze here',model)
sentiment: [-0.15224136]
>>> infer('happy day, a good movie, fun for all',model)
sentiment: [0.9714721]
>>> infer('i hate apples, they taste like sand',model)
sentiment: [-0.13639668]
>>> infer('puppies are cute - especially when playing with a ball',model)
sentiment: [0.36142796]
>>> exit()
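The numbers printed by infer() are raw logits from the final Dense(1) layer (the model was compiled with from_logits=True), not probabilities. If a probability is easier to interpret, you can push a score through a sigmoid; for example:
import tensorflow as tf
score = -0.13639668               # logit printed for the 'apples' example above
prob = tf.sigmoid(score).numpy()  # ~0.47; below 0.5 leans negative, above 0.5 leans positive
print(prob)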
You can use the CUDA module on staging1 to compile with nvcc. Here's an example CUDA C program from this tutorial (saved as vector_add.cu):
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 10000000
#define MAX_ERR 1e-6
__global__ void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i ++){
        out[i] = a[i] + b[i];
    }
}
int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    // Allocate host memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);
    // Initialize host arrays
    for(int i = 0; i < N; i++){
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
    // Executing kernel
    vector_add<<<1,1>>>(d_out, d_a, d_b, N);
    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // Verification
    for(int i = 0; i < N; i++){
        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
    }
    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");
    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    // Deallocate host memory
    free(a);
    free(b);
    free(out);
}
Load a CUDA module, then compile:
module load cuda/12.2
nvcc vector_add.cu -o vector_add
Running the 'vector_add' program on staging1 will fail due to the lack of a GPU device. Here's a slurm submit script to run it on a GPU node:
#!/bin/bash
#SBATCH -p gpu-volatile
#SBATCH --gres gpu:1
cd $SLURM_SUBMIT_DIR
hostname
module use /opt/modules/modulefiles
module load cuda/12.2
./vector_add
echo "*--done--*"
When run successfully, you should get output like:
boswald.ui@staging1 ~/gpu-workshop * cat slurm-241319.out
node03
out[0] = 3.000000
PASSED
*--done--*