PyTorch CRNN with Multiple GPUs on a Single Machine

【Environment】:

      Python 3.6; PyTorch 1.2.0; 2 × GTX 1080 Ti GPUs

【Tutorial】:

    1. Set which GPU IDs to use:

import os
import torch

## Select which GPUs are visible to this process
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
ids = [0, 1]
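
    Note that CUDA_VISIBLE_DEVICES must be set before the first CUDA call in the process; PyTorch then renumbers the visible GPUs from 0, so device_ids always refers to the remapped numbering. A quick sanity check:

# Both GPUs should now be visible under the remapped IDs 0 and 1.
print(torch.cuda.device_count())  # expected: 2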

    2. Wrap the net with DataParallel

import torch

net.cuda()  # move the parameters to the default GPU before wrapping
net = torch.nn.DataParallel(net, device_ids=ids)
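
    DataParallel splits the input batch along dim 0, runs a replica of the module on each GPU, and gathers the outputs back along dim 0 on the default device. A self-contained sketch of the pattern (the Conv2d here is a stand-in module, not the CRNN):

import torch

toy = torch.nn.Conv2d(3, 8, kernel_size=3)  # stand-in module
toy.cuda()                                  # parameters to GPU 0 first
toy = torch.nn.DataParallel(toy, device_ids=[0, 1])

x = torch.randn(4, 3, 32, 32).cuda()  # batch of 4 is split 2/2 across the GPUs
y = toy(x)                            # gathered back on GPU 0: (4, 8, 30, 30)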

    3. Modify the CRNN

  1. The RNN part

    class BLSTM2(torch.nn.Module):
        def __init__(self, lstminput_size, class_num, hidden_unit=256):
            super(BLSTM2, self).__init__()
            # The input is sequence-first, (seq_len, batch, input_size), so do NOT
            # pass batch_first=True here.
            self.Bidirectional_LSTM1 = torch.nn.LSTM(lstminput_size, lstminput_size // 2,
                                                     bidirectional=True)
            output1_size = lstminput_size
            self.embedding1 = torch.nn.Linear((lstminput_size // 2) * 2, output1_size)

            self.Bidirectional_LSTM2 = torch.nn.LSTM(output1_size, hidden_unit,
                                                     bidirectional=True)
            self.embedding2 = torch.nn.Linear(hidden_unit * 2, class_num)

        def forward(self, x):
            # Re-compact the weights after DataParallel replication;
            # see the UserWarning discussed below.
            self.Bidirectional_LSTM1.flatten_parameters()
            x, _ = self.Bidirectional_LSTM1(x)
            # LSTM returns (output, (h_n, c_n));
            # output: (seq_len, batch, 2 * hidden_size) = torch.Size([20, 100, 512])

            T, b, h = x.size()
            x = self.embedding1(x.contiguous().view(T * b, h))
            # Linear maps [T * b, 2 * (lstminput_size // 2)] -> [T * b, output1_size]
            x = x.view(T, b, -1)  # back to [20, 100, 512]

            self.Bidirectional_LSTM2.flatten_parameters()
            x, _ = self.Bidirectional_LSTM2(x)  # (seq_len, batch, 2 * hidden_unit)
            T, b, h = x.size()
            x = self.embedding2(x.contiguous().view(T * b, h))
            # Linear maps [T * b, 2 * hidden_unit] -> [T * b, class_num]
            x = x.view(T, b, -1)
            return x  # [seq_len, batch, class_num] = [20, 100, class_num]

    Fix: calling flatten_parameters() at the start of forward(), as shown above, resolves: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters(). Under DataParallel this warning appears because each replica's RNN weights are copies that no longer sit in the single contiguous buffer cuDNN expects.
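
    As a quick shape check, a minimal sketch matching the sizes in the comments above (class_num=37 is an assumed placeholder, e.g. 36 characters plus the CTC blank):

import torch

rnn = BLSTM2(lstminput_size=512, class_num=37)
x = torch.randn(20, 100, 512)   # (seq_len, batch, input_size)
print(rnn(x).shape)             # torch.Size([20, 100, 37])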
     

     

  2. CRNN
    class CRNN(torch.nn.Module):
        def __init__(self, imgh, imgw, lstminput_size, class_num, hidden_unit=256):
            super(CRNN, self).__init__()
            self.cnn = torch.nn.Sequential()
            self.cnn.add_module('vgg_16', Vgg_16(imgh, imgw))
            self.rnn = torch.nn.Sequential()
            self.rnn.add_module('rnn', BLSTM2(lstminput_size, class_num, hidden_unit))

        def forward(self, x):
            x = self.cnn(x)  # (b, c, h, w) = torch.Size([100, 512, 1, 20])
            b, c, h, w = x.size()
            assert h == 1, "the height of the conv feature map must be 1"
            x = x.squeeze(2)        # drop the h dimension: (b, c, w) = torch.Size([100, 512, 20])
            x = x.permute(2, 0, 1)  # (w, b, c) = (seq_len, batch, input_size) = torch.Size([20, 100, 512])
            x = self.rnn(x)         # (seq_len, batch, class_num)
            # Return batch-first so DataParallel gathers the replicas along dim 0 (the batch):
            x = x.permute(1, 0, 2)  # (batch, seq_len, class_num) = (100, 20, class_num)
            return x
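
    Putting the pieces together, a minimal sketch (Vgg_16 is the author's CNN backbone defined elsewhere; imgh=32, imgw=320, class_num=37 and the 3-channel input are assumed placeholders):

net = CRNN(imgh=32, imgw=320, lstminput_size=512, class_num=37)
net.cuda()
net = torch.nn.DataParallel(net, device_ids=ids)

img = torch.randn(100, 3, 32, 320).cuda()  # (batch, channels, H, W)
pred = net(img)                            # batch-first: (batch, seq_len, class_num)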

    【Note】: forward() now returns a batch-first tensor so that DataParallel can gather the per-GPU outputs along dim 0; permute it back to (seq_len, batch, class_num) outside the net before computing the CTC loss: pred = net(img).permute(1, 0, 2)
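
    To make the note concrete, here is a hedged sketch of feeding the prediction into torch.nn.CTCLoss, which expects (T, N, C) log-probabilities; the random targets are stand-ins for the output of a real label encoder:

import torch

criterion = torch.nn.CTCLoss(blank=0)

pred = net(img).permute(1, 0, 2)   # back to (seq_len, batch, class_num)
log_probs = pred.log_softmax(2)
T, b, C = log_probs.size()
input_lengths = torch.full((b,), T, dtype=torch.long)
# Stand-in targets so the sketch runs; real ones come from the label encoding.
target_lengths = torch.randint(1, 10, (b,), dtype=torch.long)
targets = torch.randint(1, C, (int(target_lengths.sum().item()),), dtype=torch.long)
loss = criterion(log_probs, targets, input_lengths, target_lengths)
loss.backward()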

    4. If you have a pretrained model, loading it differs from the single-GPU case

from collections import OrderedDict
import torch

# A checkpoint saved from a single-GPU model lacks the 'module.' prefix
# that DataParallel adds to every parameter name, so add it before loading.
state_dict = torch.load(crnn_model_path)
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    if 'module' not in k:
        k = 'module.' + k
    else:
        k = k.replace('features.module.', 'module.features.')
    new_state_dict[k] = v
net.load_state_dict(new_state_dict)
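
    The reverse direction also comes up: loading a checkpoint saved from a DataParallel-wrapped model into a plain single-GPU model. A minimal sketch, assuming the same crnn_model_path:

from collections import OrderedDict
import torch

state_dict = torch.load(crnn_model_path, map_location='cpu')
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    # strip the leading 'module.' that DataParallel added when saving
    new_state_dict[k[7:] if k.startswith('module.') else k] = v
net.load_state_dict(new_state_dict)

    Saving net.module.state_dict() instead of net.state_dict() sidesteps the prefix mismatch entirely.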

 


Copyright notice: This is an original article by weixin_41632154, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.