!cp -r ../input/golfdb3/* ./

# path of the video file to analyse (the second assignment overrides the first)
stra='../input/golfdb3/test_video.mp4'
stra='../input/golfdb2/golfdb/data/videos_160/1017.mp4'
stra
Golf Swing Part II - Separating Swing Positions
Using Python to separate a golf swing into the different parts of the swing
Overview
In a previous part (Part 1) a neural network model was used to find positions on the body during a golf swing. That work used images taken from videos of golf swings (analysed using the code below, by the authors listed), because it is often easier to work with images than with videos.
To analyse the swing from images, however, it helps to capture those images at particular points of the swing. That is what this part does.
The code is taken from https://github.com/wmcnally/golfdb and is described in the paper at https://arxiv.org/abs/1903.06528
[Ref Paper] McNally, William, et al. "GolfDB: A Video Database for Golf Swing Sequencing." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops, 2019.
The code separates the golf swing into a number of different segments based on body and golf club positions.
This code can be run on Kaggle here: https://www.kaggle.com/thomassimm/golfdb-lessimports
The input is an mp4 file of a golf swing.
The output is a series of images at different parts of the golf swing.
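For reference, the GolfDB paper defines eight swing events, and the output images correspond to these in order. The list below is a small helper of my own (the names follow the paper, but the variable is not part of the original repository):

# The eight GolfDB swing events, in the order the model predicts them
# (helper list of my own, not part of the original repository)
EVENT_NAMES = ['Address', 'Toe-up', 'Mid-backswing', 'Top',
               'Mid-downswing', 'Impact', 'Mid-follow-through', 'Finish']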
The Code
Specify the file to use
Add the downloaded directory (not always necessary) and specify the video file.
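If you want to use your own clip instead, a quick sanity check with OpenCV confirms the file opens and reports its frame count and frame rate. This is a small sketch of my own; the path is just a placeholder.

import cv2

video_path = '../input/golfdb3/test_video.mp4'  # placeholder: point this at your own mp4
cap = cv2.VideoCapture(video_path)
print('opened:', cap.isOpened())
print('frames:', int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
print('fps:   ', cap.get(cv2.CAP_PROP_FPS))
cap.release()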
Imports, classes and defs
Some imports. The neural networks use PyTorch.
import scipy.io
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
# from eval import ToTensor, Normalize
# from model import EventDetector
import numpy as np
import torch.nn.functional as F
import cv2
from torch.autograd import Variable
The following classes and definitions are taken from the files in the GitHub directory
class SampleVideo(Dataset):
    def __init__(self, path, input_size=160, transform=None):
        self.path = path
        self.input_size = input_size
        self.transform = transform

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        cap = cv2.VideoCapture(self.path)
        frame_size = [cap.get(cv2.CAP_PROP_FRAME_HEIGHT), cap.get(cv2.CAP_PROP_FRAME_WIDTH)]
        ratio = self.input_size / max(frame_size)
        new_size = tuple([int(x * ratio) for x in frame_size])
        delta_w = self.input_size - new_size[1]
        delta_h = self.input_size - new_size[0]
        top, bottom = delta_h // 2, delta_h - (delta_h // 2)
        left, right = delta_w // 2, delta_w - (delta_w // 2)

        # preprocess and return frames
        images = []
        for pos in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            _, img = cap.read()
            resized = cv2.resize(img, (new_size[1], new_size[0]))
            b_img = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT,
                                       value=[0.406 * 255, 0.456 * 255, 0.485 * 255])  # ImageNet means (BGR)
            b_img_rgb = cv2.cvtColor(b_img, cv2.COLOR_BGR2RGB)
            images.append(b_img_rgb)
        cap.release()

        labels = np.zeros(len(images))  # only for compatibility with transforms
        sample = {'images': np.asarray(images), 'labels': np.asarray(labels)}
        if self.transform:
            sample = self.transform(sample)
        return sample
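As a quick check of what the dataset returns (a sketch of my own, not in the original notebook), indexing a SampleVideo built without transforms gives the raw frame stack as a numpy array of shape (num_frames, 160, 160, 3) plus dummy labels:

# hypothetical check, not part of the original notebook
ds_check = SampleVideo(stra)       # no transform: raw numpy arrays
sample = ds_check[0]
print(sample['images'].shape)      # (num_frames, 160, 160, 3)
print(sample['labels'].shape)      # (num_frames,) dummy labels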
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""
    def __call__(self, sample):
        images, labels = sample['images'], sample['labels']
        images = images.transpose((0, 3, 1, 2))
        return {'images': torch.from_numpy(images).float().div(255.),
                'labels': torch.from_numpy(labels).long()}


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = torch.tensor(mean, dtype=torch.float32)
        self.std = torch.tensor(std, dtype=torch.float32)

    def __call__(self, sample):
        images, labels = sample['images'], sample['labels']
        images.sub_(self.mean[None, :, None, None]).div_(self.std[None, :, None, None])
        return {'images': images, 'labels': labels}
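As an illustration of what the two transforms do (my own sketch, not part of the original code), they turn the (N, H, W, 3) frame stack into a normalized (N, 3, H, W) float tensor:

# hypothetical illustration of the transform pipeline (not in the original notebook)
import numpy as np
from torchvision import transforms

fake = {'images': np.zeros((4, 160, 160, 3), dtype=np.uint8),
        'labels': np.zeros(4)}
tf = transforms.Compose([ToTensor(),
                         Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
out = tf(fake)
print(out['images'].shape)   # torch.Size([4, 3, 160, 160])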
import torch.nn as nn
import math
"""
https://github.com/tonylins/pytorch-mobilenet-v2
"""
def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        min_depth = 16
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult) if width_mult >= 1.0 else input_channel
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = max(int(c * width_mult), min_depth)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
import torch.nn as nn
class EventDetector(nn.Module):
    def __init__(self, pretrain, width_mult, lstm_layers, lstm_hidden, bidirectional=True, dropout=True):
        super(EventDetector, self).__init__()
        self.width_mult = width_mult
        self.lstm_layers = lstm_layers
        self.lstm_hidden = lstm_hidden
        self.bidirectional = bidirectional
        self.dropout = dropout

        net = MobileNetV2(width_mult=width_mult)
        state_dict_mobilenet = torch.load('mobilenet_v2.pth.tar')
        if pretrain:
            net.load_state_dict(state_dict_mobilenet)

        self.cnn = nn.Sequential(*list(net.children())[0][:19])
        self.rnn = nn.LSTM(int(1280*width_mult if width_mult > 1.0 else 1280),
                           self.lstm_hidden, self.lstm_layers,
                           batch_first=True, bidirectional=bidirectional)
        if self.bidirectional:
            self.lin = nn.Linear(2*self.lstm_hidden, 9)
        else:
            self.lin = nn.Linear(self.lstm_hidden, 9)
        if self.dropout:
            self.drop = nn.Dropout(0.5)

    def init_hidden(self, batch_size):
        if self.bidirectional:
            return (Variable(torch.zeros(2*self.lstm_layers, batch_size, self.lstm_hidden).cuda(), requires_grad=True),
                    Variable(torch.zeros(2*self.lstm_layers, batch_size, self.lstm_hidden).cuda(), requires_grad=True))
        else:
            return (Variable(torch.zeros(self.lstm_layers, batch_size, self.lstm_hidden).cuda(), requires_grad=True),
                    Variable(torch.zeros(self.lstm_layers, batch_size, self.lstm_hidden).cuda(), requires_grad=True))

    def forward(self, x, lengths=None):
        batch_size, timesteps, C, H, W = x.size()
        self.hidden = self.init_hidden(batch_size)

        # CNN forward
        c_in = x.view(batch_size * timesteps, C, H, W)
        c_out = self.cnn(c_in)
        c_out = c_out.mean(3).mean(2)
        if self.dropout:
            c_out = self.drop(c_out)

        # LSTM forward
        r_in = c_out.view(batch_size, timesteps, -1)
        r_out, states = self.rnn(r_in, self.hidden)
        out = self.lin(r_out)
        out = out.view(batch_size*timesteps, 9)

        return out
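As a quick shape check (a sketch of my own, not part of the original notebook), passing a short dummy clip through the detector yields one row of 9 logits per frame: the 8 swing events plus a no-event class. It assumes mobilenet_v2.pth.tar is present in the working directory (the constructor loads it regardless of pretrain) and that a GPU is available, because init_hidden places the LSTM state on CUDA.

# hypothetical shape check (my own addition); needs mobilenet_v2.pth.tar and a CUDA device
check_model = EventDetector(pretrain=True, width_mult=1., lstm_layers=1,
                            lstm_hidden=256, bidirectional=True, dropout=False).cuda()
check_model.eval()
dummy = torch.zeros(1, 8, 3, 160, 160).cuda()   # (batch, timesteps, C, H, W)
with torch.no_grad():
    print(check_model(dummy).shape)             # torch.Size([8, 9])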
Run the code
seq_length = 64

ds = SampleVideo(stra, transform=transforms.Compose([ToTensor(),
                                                     Normalize([0.485, 0.456, 0.406],
                                                               [0.229, 0.224, 0.225])]))

dl = DataLoader(ds, batch_size=1, shuffle=False, drop_last=False)

model = EventDetector(pretrain=True,
                      width_mult=1.,
                      lstm_layers=1,
                      lstm_hidden=256,
                      bidirectional=True,
                      dropout=False)
try:
    save_dict = torch.load('models/swingnet_1800.pth.tar')
except:
    print("Model weights not found. Download model weights and place in 'models' folder. See README for instructions")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model.load_state_dict(save_dict['model_state_dict'])
model.to(device)
model.eval()
print("Loaded model weights")

print('Testing...')
for sample in dl:
    images = sample['images']
    # full samples do not fit into GPU memory so evaluate sample in 'seq_length' batches
    batch = 0
    while batch * seq_length < images.shape[1]:
        if (batch + 1) * seq_length > images.shape[1]:
            image_batch = images[:, batch * seq_length:, :, :, :]
        else:
            image_batch = images[:, batch * seq_length:(batch + 1) * seq_length, :, :, :]
        logits = model(image_batch.cuda())
        if batch == 0:
            probs = F.softmax(logits.data, dim=1).cpu().numpy()
        else:
            probs = np.append(probs, F.softmax(logits.data, dim=1).cpu().numpy(), 0)
        batch += 1

events = np.argmax(probs, axis=0)[:-1]
print('Predicted event frames: {}'.format(events))
confidence = []
for i, e in enumerate(events):
    confidence.append(probs[e, i])
print('Confidence: {}'.format([np.round(c, 3) for c in confidence]))
Output:
Using device: cuda
Loaded model weights
Testing...
Predicted event frames: [ 82 121 137 166 189 203 213 245]
Confidence: [0.215, 0.376, 0.79, 0.767, 0.827, 0.968, 0.935, 0.247]
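The eight frame numbers correspond, in order, to the eight GolfDB events. The short printout below pairs each predicted frame with its event name and confidence; it is my own addition (the event names follow the paper, the code is not part of the original notebook).

# pair each predicted frame with its event name and confidence (my own addition)
event_names = ['Address', 'Toe-up', 'Mid-backswing', 'Top',
               'Mid-downswing', 'Impact', 'Mid-follow-through', 'Finish']
for name, frame, c in zip(event_names, events, confidence):
    print(f'{name:<20s} frame {frame:4d}  confidence {c:.3f}')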
Plot the results
import os

## delete any existing jpg images from the working directory
lsa=os.listdir()
fimg=[ll for ll in lsa if ll.split('.')[-1]=='jpg']
# print(fimg)
imgs=[os.remove(ff) for ff in fimg]

# re-list to confirm the jpg files have been removed
fimg=[ll for ll in os.listdir() if ll.split('.')[-1]=='jpg']
def createImages(fila, pos, nomS):
    '''
    Given a video file location (fila) it will save as images to a folder
    Given positions in video (pos) these images from the video are saved
    pos is created based on positions of swings
    '''
    import cv2
    cap = cv2.VideoCapture(fila)
    eventNom = [0, 1, 2, 3, 4, 5, 6, 7]
    for i, e in enumerate(pos):
        # jump to the predicted frame and grab it
        cap.set(cv2.CAP_PROP_POS_FRAMES, e)
        _, img = cap.read()
        # save frame as JPG file
        cv2.imwrite(os.path.join(os.getcwd(), '_' + nomS + '_' + "frame{:d}.jpg".format(eventNom[i])), img)

fila = stra
pos = events
createImages(fila, pos, '10')
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

lsa = os.listdir()
fimg = [ll for ll in lsa if ll.split('.')[-1] == 'jpg']

fimg.sort()
imgs = [mpimg.imread(ff) for ff in fimg]

cap = cv2.VideoCapture(stra)

# plt.subplot(4,2,1)
f, axs = plt.subplots(4, 2, figsize=(15, 15))
for i, e in enumerate(events):
    cap.set(cv2.CAP_PROP_POS_FRAMES, e)
    _, img = cap.read()
    plt.subplot(4, 2, i+1)
    plt.imshow(img)
    plt.title(e)
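Note that OpenCV returns frames in BGR order, so the colours in the figure above will look swapped. A small optional tweak of my own (not in the original notebook) converts each frame to RGB before plotting:

# optional tweak (my own addition): convert BGR (OpenCV) to RGB (matplotlib)
f, axs = plt.subplots(4, 2, figsize=(15, 15))
for i, e in enumerate(events):
    cap.set(cv2.CAP_PROP_POS_FRAMES, e)
    _, img = cap.read()
    plt.subplot(4, 2, i + 1)
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(e)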