init - Initialize project
22  samples/dnn/CMakeLists.txt  Normal file
@@ -0,0 +1,22 @@
ocv_install_example_src(dnn *.cpp *.hpp CMakeLists.txt)

set(OPENCV_DNN_SAMPLES_REQUIRED_DEPS
  opencv_core
  opencv_imgproc
  opencv_dnn
  opencv_imgcodecs
  opencv_videoio
  opencv_highgui)
ocv_check_dependencies(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})

if(NOT BUILD_EXAMPLES OR NOT OCV_DEPENDENCIES_FOUND)
  return()
endif()

project(dnn_samples)
ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
file(GLOB_RECURSE dnn_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${dnn_samples})
  ocv_define_sample(tgt ${sample_filename} dnn)
  ocv_target_link_libraries(${tgt} PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
endforeach()
84  samples/dnn/README.md  Normal file
@@ -0,0 +1,84 @@
# OpenCV deep learning module samples

## Model Zoo

Check [a wiki](https://github.com/opencv/opencv/wiki/Deep-Learning-in-OpenCV) for a list of tested models.

If OpenCV is built with [Intel's Inference Engine support](https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend) you can use [Intel's pre-trained](https://github.com/opencv/open_model_zoo) models.

Different models require different preprocessing parameters, such as mean subtraction or scale factors.
You can find the most popular models and their parameters in the [models.yml](https://github.com/opencv/opencv/blob/master/samples/dnn/models.yml) configuration file. It can also be used to alias sample parameters. For example,

```bash
python object_detection.py opencv_fd --model /path/to/caffemodel --config /path/to/prototxt
```

Check the `-h` option to see which values are used by default:

```bash
python object_detection.py opencv_fd -h
```

### Sample models

You can download sample models using `download_models.py`. For example, the following command downloads network weights for the OpenCV Face Detector model and stores them in the FaceDetector folder:

```bash
python download_models.py --save_dir FaceDetector opencv_fd
```

You can use default configuration files adopted for OpenCV from [here](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn).

You can also use the script to download necessary files from your own code. Assume you have the following code inside `your_script.py`:

```python
from download_models import downloadFile

filepath1 = downloadFile("https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc", None, filename="MobileNetSSD_deploy.caffemodel", save_dir="save_dir_1")
filepath2 = downloadFile("https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc", "994d30a8afaa9e754d17d2373b2d62a7dfbaaf7a", filename="MobileNetSSD_deploy.caffemodel")
print(filepath1)
print(filepath2)
# Your code
```

By running the following commands, you will get the **MobileNetSSD_deploy.caffemodel** file:
```bash
export OPENCV_DOWNLOAD_DATA_PATH=download_folder
python your_script.py
```

**Note** that you can provide a directory using the **save_dir** parameter or via the **OPENCV_SAVE_DIR** environment variable.

#### Face detection
[The original model](https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector)
with single-precision floating-point weights has been quantized using the [TensorFlow framework](https://www.tensorflow.org/).
To achieve the best accuracy, run the model on BGR images resized to `300x300`, applying mean subtraction
of `(104, 177, 123)` for the blue, green and red channels respectively.
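
A minimal Python sketch of this preprocessing, assuming the `deploy.prototxt` and `res10_300x300_ssd_iter_140000.caffemodel` files from the face_detector folder (these file names and the test image are illustrative, adjust them to your setup):

```python
import cv2 as cv

# Assumed file names from samples/dnn/face_detector; adjust the paths as needed.
net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'res10_300x300_ssd_iter_140000.caffemodel')

img = cv.imread('example.jpg')  # OpenCV loads images in BGR order, which this model expects
# Resize to 300x300 and subtract the (B, G, R) mean values; swapRB is left at False to keep BGR.
blob = cv.dnn.blobFromImage(img, 1.0, (300, 300), (104, 177, 123))
net.setInput(blob)
detections = net.forward()  # 1x1xNx7: [image_id, class_id, confidence, left, top, right, bottom] (normalized coordinates)
```
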
The following are accuracy metrics obtained using the [COCO object detection evaluation
tool](http://cocodataset.org/#detections-eval) on the [FDDB dataset](http://vis-www.cs.umass.edu/fddb/)
(see [script](https://github.com/opencv/opencv/blob/master/modules/dnn/misc/face_detector_accuracy.py)),
both resizing to `300x300` and keeping the original image sizes.
```
AP - Average Precision                            | FP32/FP16 | UINT8          | FP32/FP16 | UINT8          |
AR - Average Recall                               | 300x300   | 300x300        | any size  | any size       |
--------------------------------------------------|-----------|----------------|-----------|----------------|
AP @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] | 0.408     | 0.408          | 0.378     | 0.328 (-0.050) |
AP @[ IoU=0.50      | area= all   | maxDets=100 ] | 0.849     | 0.849          | 0.797     | 0.790 (-0.007) |
AP @[ IoU=0.75      | area= all   | maxDets=100 ] | 0.251     | 0.251          | 0.208     | 0.140 (-0.068) |
AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ] | 0.050     | 0.051 (+0.001) | 0.107     | 0.070 (-0.037) |
AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] | 0.381     | 0.379 (-0.002) | 0.380     | 0.368 (-0.012) |
AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.455     | 0.455          | 0.412     | 0.337 (-0.075) |
AR @[ IoU=0.50:0.95 | area= all   | maxDets=  1 ] | 0.299     | 0.299          | 0.279     | 0.246 (-0.033) |
AR @[ IoU=0.50:0.95 | area= all   | maxDets= 10 ] | 0.482     | 0.482          | 0.476     | 0.436 (-0.040) |
AR @[ IoU=0.50:0.95 | area= all   | maxDets=100 ] | 0.496     | 0.496          | 0.491     | 0.451 (-0.040) |
AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ] | 0.189     | 0.193 (+0.004) | 0.284     | 0.232 (-0.052) |
AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] | 0.481     | 0.480 (-0.001) | 0.470     | 0.458 (-0.012) |
AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.528     | 0.528          | 0.520     | 0.462 (-0.058) |
```

## References
* [Models downloading script](https://github.com/opencv/opencv/samples/dnn/download_models.py)
* [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn)
* [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
* [Names of classes from different datasets](https://github.com/opencv/opencv/tree/master/samples/data/dnn)
82  samples/dnn/action_recognition.py  Normal file
@@ -0,0 +1,82 @@
import os
import numpy as np
import cv2 as cv
import argparse
from common import findFile

parser = argparse.ArgumentParser(description='Use this script to run action recognition using 3D ResNet34',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', '-i', help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to model.')
parser.add_argument('--classes', default=findFile('action_recongnition_kinetics.txt'), help='Path to classes list.')

# To get net download original repository https://github.com/kenshohara/video-classification-3d-cnn-pytorch
# For correct ONNX export modify file: video-classification-3d-cnn-pytorch/models/resnet.py
# change
# - def downsample_basic_block(x, planes, stride):
# -     out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# -     zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
# -                              out.size(2), out.size(3),
# -                              out.size(4)).zero_()
# -     if isinstance(out.data, torch.cuda.FloatTensor):
# -         zero_pads = zero_pads.cuda()
# -
# -     out = Variable(torch.cat([out.data, zero_pads], dim=1))
# -     return out

# To
# + def downsample_basic_block(x, planes, stride):
# +     out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# +     out = F.pad(out, (0, 0, 0, 0, 0, 0, 0, int(planes - out.size(1)), 0, 0), "constant", 0)
# +     return out

# To ONNX export use torch.onnx.export(model, inputs, model_name)
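# A hypothetical export sketch (the model loading step and the input shape below are
# assumptions for illustration, not part of the original repository):
#   import torch
#   model = ...  # 3D ResNet-34 loaded via video-classification-3d-cnn-pytorch
#   inputs = torch.randn(1, 3, 16, 112, 112)  # batch x channels x frames x height x width
#   torch.onnx.export(model, inputs, 'resnet-34_kinetics.onnx')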

def get_class_names(path):
    class_names = []
    with open(path) as f:
        for row in f:
            class_names.append(row[:-1])
    return class_names

def classify_video(video_path, net_path):
    SAMPLE_DURATION = 16
    SAMPLE_SIZE = 112
    mean = (114.7748, 107.7354, 99.4750)
    class_names = get_class_names(args.classes)

    net = cv.dnn.readNet(net_path)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    winName = 'Deep learning image classification in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cap = cv.VideoCapture(video_path)
    while cv.waitKey(1) < 0:
        frames = []
        for _ in range(SAMPLE_DURATION):
            hasFrame, frame = cap.read()
            if not hasFrame:
                exit(0)
            frames.append(frame)

        inputs = cv.dnn.blobFromImages(frames, 1, (SAMPLE_SIZE, SAMPLE_SIZE), mean, True, crop=True)
        inputs = np.transpose(inputs, (1, 0, 2, 3))
        inputs = np.expand_dims(inputs, axis=0)
        net.setInput(inputs)
        outputs = net.forward()
        class_pred = np.argmax(outputs)
        label = class_names[class_pred]

        for frame in frames:
            labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv.rectangle(frame, (0, 10 - labelSize[1]),
                         (labelSize[0], 10 + baseLine), (255, 255, 255), cv.FILLED)
            cv.putText(frame, label, (0, 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
            cv.imshow(winName, frame)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break

if __name__ == "__main__":
    args, _ = parser.parse_known_args()
    classify_video(args.input if args.input else 0, args.model)
165  samples/dnn/classification.cpp  Normal file
@@ -0,0 +1,165 @@
#include <fstream>
#include <sstream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#include "common.hpp"

std::string keys =
    "{ help h | | Print help message. }"
    "{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo | models.yml | An optional path to file with preprocessing parameters }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ initial_width | 0 | Preprocess input image by initial resizing to a specific width.}"
    "{ initial_height | 0 | Preprocess input image by initial resizing to a specific height.}"
    "{ std | 0.0 0.0 0.0 | Preprocess input image by dividing on a standard deviation.}"
    "{ crop | false | Preprocess input image by center cropping.}"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it is not set. }"
    "{ classes | | Optional path to a text file with names of classes. }"
    "{ backend | 0 | Choose one of computation backends: "
                     "0: automatically (by default), "
                     "1: Halide language (http://halide-lang.org/), "
                     "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                     "3: OpenCV implementation }"
    "{ target | 0 | Choose one of target computation devices: "
                    "0: CPU target (by default), "
                    "1: OpenCL, "
                    "2: OpenCL fp16 (half-float precision), "
                    "3: VPU }";

using namespace cv;
using namespace dnn;

std::vector<std::string> classes;

int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    const std::string modelName = parser.get<String>("@alias");
    const std::string zooFile = parser.get<String>("zoo");

    keys += genPreprocArguments(modelName, zooFile);

    parser = CommandLineParser(argc, argv, keys);
    parser.about("Use this script to run classification deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    int rszWidth = parser.get<int>("initial_width");
    int rszHeight = parser.get<int>("initial_height");
    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    Scalar std = parser.get<Scalar>("std");
    bool swapRB = parser.get<bool>("rgb");
    bool crop = parser.get<bool>("crop");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String model = findFile(parser.get<String>("model"));
    String config = findFile(parser.get<String>("config"));
    String framework = parser.get<String>("framework");
    int backendId = parser.get<int>("backend");
    int targetId = parser.get<int>("target");

    // Open file with classes names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))
        {
            classes.push_back(line);
        }
    }

    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }
    CV_Assert(!model.empty());

    //! [Read and initialize network]
    Net net = readNet(model, config, framework);
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);
    //! [Read and initialize network]

    // Create a window
    static const std::string kWinName = "Deep learning image classification in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    //! [Open a video file or an image file or a camera stream]
    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    //! [Open a video file or an image file or a camera stream]

    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }

        if (rszWidth != 0 && rszHeight != 0)
        {
            resize(frame, frame, Size(rszWidth, rszHeight));
        }

        //! [Create a 4D blob from a frame]
        blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);

        // Check std values.
        if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
        {
            // Divide blob by std.
            divide(blob, std, blob);
        }
        //! [Create a 4D blob from a frame]

        //! [Set input blob]
        net.setInput(blob);
        //! [Set input blob]
        //! [Make forward pass]
        Mat prob = net.forward();
        //! [Make forward pass]

        //! [Get a class with a highest score]
        Point classIdPoint;
        double confidence;
        minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
        int classId = classIdPoint.x;
        //! [Get a class with a highest score]

        // Put efficiency information.
        std::vector<double> layersTimes;
        double freq = getTickFrequency() / 1000;
        double t = net.getPerfProfile(layersTimes) / freq;
        std::string label = format("Inference time: %.2f ms", t);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        // Print predicted class.
        label = format("%s: %.4f", (classes.empty() ? format("Class #%d", classId).c_str() :
                                                      classes[classId].c_str()),
                       confidence);
        putText(frame, label, Point(0, 40), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
    }
    return 0;
}
112  samples/dnn/classification.py  Normal file
@@ -0,0 +1,112 @@
import argparse

import cv2 as cv
import numpy as np
from common import *


def get_args_parser(func_args):
    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
                cv.dnn.DNN_BACKEND_OPENCV)
    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
               cv.dnn.DNN_TARGET_HDDL)

    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
                        help='An optional path to file with preprocessing parameters.')
    parser.add_argument('--input',
                        help='Path to input image or video file. Skip this argument to capture frames from a camera.')
    parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
                        help='Optional name of an origin framework of the model. '
                             'Detect it automatically if it is not set.')
    parser.add_argument('--std', nargs='*', type=float,
                        help='Preprocess input image by dividing on a standard deviation.')
    parser.add_argument('--crop', type=bool, default=False,
                        help='Preprocess input image by center cropping.')
    parser.add_argument('--initial_width', type=int,
                        help='Preprocess input image by initial resizing to a specific width.')
    parser.add_argument('--initial_height', type=int,
                        help='Preprocess input image by initial resizing to a specific height.')
    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                        help="Choose one of computation backends: "
                             "%d: automatically (by default), "
                             "%d: Halide language (http://halide-lang.org/), "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                             "%d: OpenCV implementation" % backends)
    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                        help='Choose one of target computation devices: '
                             '%d: CPU target (by default), '
                             '%d: OpenCL, '
                             '%d: OpenCL fp16 (half-float precision), '
                             '%d: NCS2 VPU, '
                             '%d: HDDL VPU' % targets)

    args, _ = parser.parse_known_args()
    add_preproc_args(args.zoo, parser, 'classification')
    parser = argparse.ArgumentParser(parents=[parser],
                                     description='Use this script to run classification deep learning networks using OpenCV.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    return parser.parse_args(func_args)


def main(func_args=None):
    args = get_args_parser(func_args)
    args.model = findFile(args.model)
    args.config = findFile(args.config)
    args.classes = findFile(args.classes)

    # Load names of classes
    classes = None
    if args.classes:
        with open(args.classes, 'rt') as f:
            classes = f.read().rstrip('\n').split('\n')

    # Load a network
    net = cv.dnn.readNet(args.model, args.config, args.framework)
    net.setPreferableBackend(args.backend)
    net.setPreferableTarget(args.target)

    winName = 'Deep learning image classification in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_NORMAL)

    cap = cv.VideoCapture(args.input if args.input else 0)
    while cv.waitKey(1) < 0:
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break

        # Create a 4D blob from a frame.
        inpWidth = args.width if args.width else frame.shape[1]
        inpHeight = args.height if args.height else frame.shape[0]

        if args.initial_width and args.initial_height:
            frame = cv.resize(frame, (args.initial_width, args.initial_height))

        blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=args.crop)
        if args.std:
            blob[0] /= np.asarray(args.std, dtype=np.float32).reshape(3, 1, 1)

        # Run a model
        net.setInput(blob)
        out = net.forward()

        # Get a class with a highest score.
        out = out.flatten()
        classId = np.argmax(out)
        confidence = out[classId]

        # Put efficiency information.
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

        # Print predicted class.
        label = '%s: %.4f' % (classes[classId] if classes else 'Class #%d' % classId, confidence)
        cv.putText(frame, label, (0, 40), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

        cv.imshow(winName, frame)


if __name__ == "__main__":
    main()
128  samples/dnn/colorization.cpp  Normal file
@@ -0,0 +1,128 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>

using namespace cv;
using namespace cv::dnn;
using namespace std;

// the 313 ab cluster centers from pts_in_hull.npy (already transposed)
static float hull_pts[] = {
    -90., -90., -90., -90., -90., -80., -80., -80., -80., -80., -80., -80., -80., -70., -70., -70., -70., -70., -70., -70., -70.,
    -70., -70., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -50., -50., -50., -50., -50., -50., -50., -50.,
    -50., -50., -50., -50., -50., -50., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -30.,
    -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -20., -20., -20., -20., -20., -20., -20.,
    -20., -20., -20., -20., -20., -20., -20., -20., -20., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10.,
    -10., -10., -10., -10., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 10., 10., 10.,
    10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20.,
    20., 20., 20., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 40., 40., 40., 40.,
    40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 50., 50., 50., 50., 50., 50., 50., 50., 50., 50.,
    50., 50., 50., 50., 50., 50., 50., 50., 50., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60.,
    60., 60., 60., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 80., 80., 80.,
    80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90.,
    90., 90., 90., 90., 90., 90., 90., 90., 90., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 50., 60., 70., 80., 90.,
    20., 30., 40., 50., 60., 70., 80., 90., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -20., -10., 0., 10., 20., 30., 40., 50.,
    60., 70., 80., 90., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -40., -30., -20., -10., 0., 10., 20.,
    30., 40., 50., 60., 70., 80., 90., 100., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -50.,
    -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -60., -50., -40., -30., -20., -10., 0., 10., 20.,
    30., 40., 50., 60., 70., 80., 90., 100., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90.,
    100., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -80., -70., -60., -50.,
    -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -90., -80., -70., -60., -50., -40., -30., -20., -10.,
    0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30.,
    40., 50., 60., 70., 80., 90., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70.,
    80., -110., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., -110., -100.,
    -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., -110., -100., -90., -80., -70.,
    -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., -110., -100., -90., -80., -70., -60., -50., -40., -30.,
    -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0.
};

int main(int argc, char **argv)
{
    const string about =
        "This sample demonstrates recoloring grayscale images with dnn.\n"
        "This program is based on:\n"
        "  http://richzhang.github.io/colorization\n"
        "  https://github.com/richzhang/colorization\n"
        "Download caffemodel and prototxt files:\n"
        "  http://eecs.berkeley.edu/~rich.zhang/projects/2016_colorization/files/demo_v2/colorization_release_v2.caffemodel\n"
        "  https://raw.githubusercontent.com/richzhang/colorization/master/colorization/models/colorization_deploy_v2.prototxt\n";
    const string keys =
        "{ h help |                                    | print this help message }"
        "{ proto  | colorization_deploy_v2.prototxt    | model configuration }"
        "{ model  | colorization_release_v2.caffemodel | model weights }"
        "{ image  | space_shuttle.jpg                  | path to image file }"
        "{ opencl |                                    | enable OpenCL }";
    CommandLineParser parser(argc, argv, keys);
    parser.about(about);
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    string modelTxt = samples::findFile(parser.get<string>("proto"));
    string modelBin = samples::findFile(parser.get<string>("model"));
    string imageFile = samples::findFile(parser.get<string>("image"));
    bool useOpenCL = parser.has("opencl");
    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }

    Mat img = imread(imageFile);
    if (img.empty())
    {
        cout << "Can't read image from file: " << imageFile << endl;
        return 2;
    }

    // fixed input size for the pretrained network
    const int W_in = 224;
    const int H_in = 224;
    Net net = dnn::readNetFromCaffe(modelTxt, modelBin);
    if (useOpenCL)
        net.setPreferableTarget(DNN_TARGET_OPENCL);

    // setup additional layers:
    int sz[] = {2, 313, 1, 1};
    const Mat pts_in_hull(4, sz, CV_32F, hull_pts);
    Ptr<dnn::Layer> class8_ab = net.getLayer("class8_ab");
    class8_ab->blobs.push_back(pts_in_hull);
    Ptr<dnn::Layer> conv8_313_rh = net.getLayer("conv8_313_rh");
    conv8_313_rh->blobs.push_back(Mat(1, 313, CV_32F, Scalar(2.606)));

    // extract L channel and subtract mean
    Mat lab, L, input;
    img.convertTo(img, CV_32F, 1.0/255);
    cvtColor(img, lab, COLOR_BGR2Lab);
    extractChannel(lab, L, 0);
    resize(L, input, Size(W_in, H_in));
    input -= 50;

    // run the L channel through the network
    Mat inputBlob = blobFromImage(input);
    net.setInput(inputBlob);
    Mat result = net.forward();

    // retrieve the calculated a,b channels from the network output
    Size siz(result.size[2], result.size[3]);
    Mat a = Mat(siz, CV_32F, result.ptr(0,0));
    Mat b = Mat(siz, CV_32F, result.ptr(0,1));
    resize(a, a, img.size());
    resize(b, b, img.size());

    // merge, and convert back to BGR
    Mat color, chn[] = {L, a, b};
    merge(chn, 3, lab);
    cvtColor(lab, color, COLOR_Lab2BGR);

    imshow("color", color);
    imshow("original", img);
    waitKey();
    return 0;
}
69  samples/dnn/colorization.py  Normal file
@@ -0,0 +1,69 @@
# Script is based on https://github.com/richzhang/colorization/blob/master/colorization/colorize.py
# To download the caffemodel and the prototxt, see: https://github.com/richzhang/colorization/tree/master/colorization/models
# To download pts_in_hull.npy, see: https://github.com/richzhang/colorization/blob/master/colorization/resources/pts_in_hull.npy
import numpy as np
import argparse
import cv2 as cv

def parse_args():
    parser = argparse.ArgumentParser(description='iColor: deep interactive colorization')
    parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
    parser.add_argument('--prototxt', help='Path to colorization_deploy_v2.prototxt', required=True)
    parser.add_argument('--caffemodel', help='Path to colorization_release_v2.caffemodel', required=True)
    parser.add_argument('--kernel', help='Path to pts_in_hull.npy', required=True)

    args = parser.parse_args()
    return args

if __name__ == '__main__':
    W_in = 224
    H_in = 224
    imshowSize = (640, 480)

    args = parse_args()

    # Select desired model
    net = cv.dnn.readNetFromCaffe(args.prototxt, args.caffemodel)

    pts_in_hull = np.load(args.kernel)  # load cluster centers

    # populate cluster centers as 1x1 convolution kernel
    pts_in_hull = pts_in_hull.transpose().reshape(2, 313, 1, 1)
    net.getLayer(net.getLayerId('class8_ab')).blobs = [pts_in_hull.astype(np.float32)]
    net.getLayer(net.getLayerId('conv8_313_rh')).blobs = [np.full([1, 313], 2.606, np.float32)]

    if args.input:
        cap = cv.VideoCapture(args.input)
    else:
        cap = cv.VideoCapture(0)

    while cv.waitKey(1) < 0:
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break

        img_rgb = (frame[:,:,[2, 1, 0]] * 1.0 / 255).astype(np.float32)

        img_lab = cv.cvtColor(img_rgb, cv.COLOR_RGB2Lab)
        img_l = img_lab[:,:,0]  # pull out L channel
        (H_orig, W_orig) = img_rgb.shape[:2]  # original image size

        # resize image to network input size
        img_rs = cv.resize(img_rgb, (W_in, H_in))
        img_lab_rs = cv.cvtColor(img_rs, cv.COLOR_RGB2Lab)
        img_l_rs = img_lab_rs[:,:,0]
        img_l_rs -= 50  # subtract 50 for mean-centering

        net.setInput(cv.dnn.blobFromImage(img_l_rs))
        ab_dec = net.forward()[0,:,:,:].transpose((1,2,0))  # this is our result

        (H_out, W_out) = ab_dec.shape[:2]
        ab_dec_us = cv.resize(ab_dec, (W_orig, H_orig))
        img_lab_out = np.concatenate((img_l[:,:,np.newaxis], ab_dec_us), axis=2)  # concatenate with original image L
        img_bgr_out = np.clip(cv.cvtColor(img_lab_out, cv.COLOR_Lab2BGR), 0, 1)

        frame = cv.resize(frame, imshowSize)
        cv.imshow('origin', frame)
        cv.imshow('gray', cv.cvtColor(frame, cv.COLOR_RGB2GRAY))
        cv.imshow('colorized', cv.resize(img_bgr_out, imshowSize))
95  samples/dnn/common.hpp  Normal file
@@ -0,0 +1,95 @@
#include <opencv2/core/utils/filesystem.hpp>

using namespace cv;

std::string genArgument(const std::string& argName, const std::string& help,
                        const std::string& modelName, const std::string& zooFile,
                        char key = ' ', std::string defaultVal = "");

std::string genPreprocArguments(const std::string& modelName, const std::string& zooFile);

std::string findFile(const std::string& filename);

std::string genArgument(const std::string& argName, const std::string& help,
                        const std::string& modelName, const std::string& zooFile,
                        char key, std::string defaultVal)
{
    if (!modelName.empty())
    {
        FileStorage fs(zooFile, FileStorage::READ);
        if (fs.isOpened())
        {
            FileNode node = fs[modelName];
            if (!node.empty())
            {
                FileNode value = node[argName];
                if (!value.empty())
                {
                    if (value.isReal())
                        defaultVal = format("%f", (float)value);
                    else if (value.isString())
                        defaultVal = (std::string)value;
                    else if (value.isInt())
                        defaultVal = format("%d", (int)value);
                    else if (value.isSeq())
                    {
                        for (size_t i = 0; i < value.size(); ++i)
                        {
                            FileNode v = value[(int)i];
                            if (v.isInt())
                                defaultVal += format("%d ", (int)v);
                            else if (v.isReal())
                                defaultVal += format("%f ", (float)v);
                            else
                                CV_Error(Error::StsNotImplemented, "Unexpected value format");
                        }
                    }
                    else
                        CV_Error(Error::StsNotImplemented, "Unexpected field format");
                }
            }
        }
    }
    return "{ " + argName + " " + key + " | " + defaultVal + " | " + help + " }";
}

std::string findFile(const std::string& filename)
{
    if (filename.empty() || utils::fs::exists(filename))
        return filename;

    const char* extraPaths[] = {getenv("OPENCV_DNN_TEST_DATA_PATH"),
                                getenv("OPENCV_TEST_DATA_PATH")};
    for (int i = 0; i < 2; ++i)
    {
        if (extraPaths[i] == NULL)
            continue;
        std::string absPath = utils::fs::join(extraPaths[i], utils::fs::join("dnn", filename));
        if (utils::fs::exists(absPath))
            return absPath;
    }
    CV_Error(Error::StsObjectNotFound, "File " + filename + " not found! "
             "Please specify a path to /opencv_extra/testdata in OPENCV_DNN_TEST_DATA_PATH "
             "environment variable or pass a full path to model.");
}

std::string genPreprocArguments(const std::string& modelName, const std::string& zooFile)
{
    return genArgument("model", "Path to a binary file of model contains trained weights. "
                                "It could be a file with extensions .caffemodel (Caffe), "
                                ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet), .bin (OpenVINO).",
                       modelName, zooFile, 'm') +
           genArgument("config", "Path to a text file of model contains network configuration. "
                                 "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet), .xml (OpenVINO).",
                       modelName, zooFile, 'c') +
           genArgument("mean", "Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces.",
                       modelName, zooFile) +
           genArgument("scale", "Preprocess input image by multiplying on a scale factor.",
                       modelName, zooFile, ' ', "1.0") +
           genArgument("width", "Preprocess input image by resizing to a specific width.",
                       modelName, zooFile, ' ', "-1") +
           genArgument("height", "Preprocess input image by resizing to a specific height.",
                       modelName, zooFile, ' ', "-1") +
           genArgument("rgb", "Indicate that model works with RGB input images instead BGR ones.",
                       modelName, zooFile);
}
112  samples/dnn/common.py  Normal file
@@ -0,0 +1,112 @@
import sys
import os
import cv2 as cv


def add_argument(zoo, parser, name, help, required=False, default=None, type=None, action=None, nargs=None):
    if len(sys.argv) <= 1:
        return

    modelName = sys.argv[1]

    if os.path.isfile(zoo):
        fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
        node = fs.getNode(modelName)
        if not node.empty():
            value = node.getNode(name)
            if not value.empty():
                if value.isReal():
                    default = value.real()
                elif value.isString():
                    default = value.string()
                elif value.isInt():
                    default = int(value.real())
                elif value.isSeq():
                    default = []
                    for i in range(value.size()):
                        v = value.at(i)
                        if v.isInt():
                            default.append(int(v.real()))
                        elif v.isReal():
                            default.append(v.real())
                        else:
                            print('Unexpected value format')
                            exit(0)
                else:
                    print('Unexpected field format')
                    exit(0)
                required = False

    if action == 'store_true':
        default = 1 if default == 'true' else (0 if default == 'false' else default)
        assert(default is None or default == 0 or default == 1)
        parser.add_argument('--' + name, required=required, help=help, default=bool(default),
                            action=action)
    else:
        parser.add_argument('--' + name, required=required, help=help, default=default,
                            action=action, nargs=nargs, type=type)


def add_preproc_args(zoo, parser, sample):
    aliases = []
    if os.path.isfile(zoo):
        fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
        root = fs.root()
        for name in root.keys():
            model = root.getNode(name)
            if model.getNode('sample').string() == sample:
                aliases.append(name)

    parser.add_argument('alias', nargs='?', choices=aliases,
                        help='An alias name of model to extract preprocessing parameters from models.yml file.')
    add_argument(zoo, parser, 'model', required=True,
                 help='Path to a binary file of model contains trained weights. '
                      'It could be a file with extensions .caffemodel (Caffe), '
                      '.pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet), .bin (OpenVINO)')
    add_argument(zoo, parser, 'config',
                 help='Path to a text file of model contains network configuration. '
                      'It could be a file with extensions .prototxt (Caffe), .pbtxt or .config (TensorFlow), .cfg (Darknet), .xml (OpenVINO)')
    add_argument(zoo, parser, 'mean', nargs='+', type=float, default=[0, 0, 0],
                 help='Preprocess input image by subtracting mean values. '
                      'Mean values should be in BGR order.')
    add_argument(zoo, parser, 'scale', type=float, default=1.0,
                 help='Preprocess input image by multiplying on a scale factor.')
    add_argument(zoo, parser, 'width', type=int,
                 help='Preprocess input image by resizing to a specific width.')
    add_argument(zoo, parser, 'height', type=int,
                 help='Preprocess input image by resizing to a specific height.')
    add_argument(zoo, parser, 'rgb', action='store_true',
                 help='Indicate that model works with RGB input images instead BGR ones.')
    add_argument(zoo, parser, 'classes',
                 help='Optional path to a text file with names of classes to label detected objects.')


def findFile(filename):
    if filename:
        if os.path.exists(filename):
            return filename

        fpath = cv.samples.findFile(filename, False)
        if fpath:
            return fpath

        samplesDataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                      '..',
                                      'data',
                                      'dnn')
        if os.path.exists(os.path.join(samplesDataDir, filename)):
            return os.path.join(samplesDataDir, filename)

        for path in ['OPENCV_DNN_TEST_DATA_PATH', 'OPENCV_TEST_DATA_PATH']:
            try:
                extraPath = os.environ[path]
                absPath = os.path.join(extraPath, 'dnn', filename)
                if os.path.exists(absPath):
                    return absPath
            except KeyError:
                pass

        print('File ' + filename + ' not found! Please specify a path to '
              '/opencv_extra/testdata in OPENCV_DNN_TEST_DATA_PATH environment '
              'variable or pass a full path to model.')
        exit(0)
283  samples/dnn/custom_layers.hpp  Normal file
@@ -0,0 +1,283 @@
#ifndef __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__
#define __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>  // getPlane

//! [InterpLayer]
class InterpLayer : public cv::dnn::Layer
{
public:
    InterpLayer(const cv::dnn::LayerParams &params) : Layer(params)
    {
        outWidth = params.get<int>("width", 0);
        outHeight = params.get<int>("height", 0);
    }

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::Ptr<cv::dnn::Layer>(new InterpLayer(params));
    }

    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int requiredOutputs,
                                 std::vector<std::vector<int> > &outputs,
                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE
    {
        CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
        std::vector<int> outShape(4);
        outShape[0] = inputs[0][0];  // batch size
        outShape[1] = inputs[0][1];  // number of channels
        outShape[2] = outHeight;
        outShape[3] = outWidth;
        outputs.assign(1, outShape);
        return false;
    }

    // Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp
    virtual void forward(cv::InputArrayOfArrays inputs_arr,
                         cv::OutputArrayOfArrays outputs_arr,
                         cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        if (inputs_arr.depth() == CV_16S)
        {
            // In case of DNN_TARGET_OPENCL_FP16 target the following method
            // converts data from FP16 to FP32 and calls this forward again.
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<cv::Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        cv::Mat& inp = inputs[0];
        cv::Mat& out = outputs[0];
        const float* inpData = (float*)inp.data;
        float* outData = (float*)out.data;

        const int batchSize = inp.size[0];
        const int numChannels = inp.size[1];
        const int inpHeight = inp.size[2];
        const int inpWidth = inp.size[3];

        const float rheight = (outHeight > 1) ? static_cast<float>(inpHeight - 1) / (outHeight - 1) : 0.f;
        const float rwidth = (outWidth > 1) ? static_cast<float>(inpWidth - 1) / (outWidth - 1) : 0.f;
        for (int h2 = 0; h2 < outHeight; ++h2)
        {
            const float h1r = rheight * h2;
            const int h1 = static_cast<int>(h1r);
            const int h1p = (h1 < inpHeight - 1) ? 1 : 0;
            const float h1lambda = h1r - h1;
            const float h0lambda = 1.f - h1lambda;
            for (int w2 = 0; w2 < outWidth; ++w2)
            {
                const float w1r = rwidth * w2;
                const int w1 = static_cast<int>(w1r);
                const int w1p = (w1 < inpWidth - 1) ? 1 : 0;
                const float w1lambda = w1r - w1;
                const float w0lambda = 1.f - w1lambda;
                const float* pos1 = inpData + h1 * inpWidth + w1;
                float* pos2 = outData + h2 * outWidth + w2;
                for (int c = 0; c < batchSize * numChannels; ++c)
                {
                    pos2[0] =
                      h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
                      h1lambda * (w0lambda * pos1[h1p * inpWidth] + w1lambda * pos1[h1p * inpWidth + w1p]);
                    pos1 += inpWidth * inpHeight;
                    pos2 += outWidth * outHeight;
                }
            }
        }
    }

private:
    int outWidth, outHeight;
};
//! [InterpLayer]

//! [ResizeBilinearLayer]
class ResizeBilinearLayer CV_FINAL : public cv::dnn::Layer
{
public:
    ResizeBilinearLayer(const cv::dnn::LayerParams &params) : Layer(params)
    {
        CV_Assert(!params.get<bool>("align_corners", false));
        CV_Assert(!blobs.empty());

        for (size_t i = 0; i < blobs.size(); ++i)
            CV_Assert(blobs[i].type() == CV_32SC1);

        // There are two cases of input blob: a single blob which contains output
        // shape and two blobs with scaling factors.
        if (blobs.size() == 1)
        {
            CV_Assert(blobs[0].total() == 2);
            outHeight = blobs[0].at<int>(0, 0);
            outWidth = blobs[0].at<int>(0, 1);
            factorHeight = factorWidth = 0;
        }
        else
        {
            CV_Assert(blobs.size() == 2); CV_Assert(blobs[0].total() == 1); CV_Assert(blobs[1].total() == 1);
            factorHeight = blobs[0].at<int>(0, 0);
            factorWidth = blobs[1].at<int>(0, 0);
            outHeight = outWidth = 0;
        }
    }

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::Ptr<cv::dnn::Layer>(new ResizeBilinearLayer(params));
    }

    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int,
                                 std::vector<std::vector<int> > &outputs,
                                 std::vector<std::vector<int> > &) const CV_OVERRIDE
    {
        std::vector<int> outShape(4);
        outShape[0] = inputs[0][0];  // batch size
        outShape[1] = inputs[0][1];  // number of channels
        outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight);
        outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth);
        outputs.assign(1, outShape);
        return false;
    }

    virtual void finalize(cv::InputArrayOfArrays, cv::OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<cv::Mat> outputs;
        outputs_arr.getMatVector(outputs);
        if (!outWidth && !outHeight)
        {
            outHeight = outputs[0].size[2];
            outWidth = outputs[0].size[3];
        }
    }

    // This implementation is based on a reference implementation from
    // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
    virtual void forward(cv::InputArrayOfArrays inputs_arr,
                         cv::OutputArrayOfArrays outputs_arr,
                         cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        if (inputs_arr.depth() == CV_16S)
        {
            // In case of DNN_TARGET_OPENCL_FP16 target the following method
            // converts data from FP16 to FP32 and calls this forward again.
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<cv::Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        cv::Mat& inp = inputs[0];
        cv::Mat& out = outputs[0];
        const float* inpData = (float*)inp.data;
        float* outData = (float*)out.data;

        const int batchSize = inp.size[0];
        const int numChannels = inp.size[1];
        const int inpHeight = inp.size[2];
        const int inpWidth = inp.size[3];

        float heightScale = static_cast<float>(inpHeight) / outHeight;
        float widthScale = static_cast<float>(inpWidth) / outWidth;
        for (int b = 0; b < batchSize; ++b)
        {
            for (int y = 0; y < outHeight; ++y)
            {
                float input_y = y * heightScale;
                int y0 = static_cast<int>(std::floor(input_y));
                int y1 = std::min(y0 + 1, inpHeight - 1);
                for (int x = 0; x < outWidth; ++x)
                {
                    float input_x = x * widthScale;
                    int x0 = static_cast<int>(std::floor(input_x));
                    int x1 = std::min(x0 + 1, inpWidth - 1);
                    for (int c = 0; c < numChannels; ++c)
                    {
                        float interpolation =
                            inpData[offset(inp.size, c, x0, y0, b)] * (1 - (input_y - y0)) * (1 - (input_x - x0)) +
                            inpData[offset(inp.size, c, x0, y1, b)] * (input_y - y0) * (1 - (input_x - x0)) +
                            inpData[offset(inp.size, c, x1, y0, b)] * (1 - (input_y - y0)) * (input_x - x0) +
                            inpData[offset(inp.size, c, x1, y1, b)] * (input_y - y0) * (input_x - x0);
                        outData[offset(out.size, c, x, y, b)] = interpolation;
                    }
                }
            }
        }
    }

private:
    static inline int offset(const cv::MatSize& size, int c, int x, int y, int b)
    {
        return x + size[3] * (y + size[2] * (c + size[1] * b));
    }

    int outWidth, outHeight, factorWidth, factorHeight;
};
//! [ResizeBilinearLayer]

//
// The following code is used only to generate tutorials documentation.
//

//! [A custom layer interface]
class MyLayer : public cv::dnn::Layer
{
public:
    //! [MyLayer::MyLayer]
    MyLayer(const cv::dnn::LayerParams &params);
    //! [MyLayer::MyLayer]

    //! [MyLayer::create]
    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params);
    //! [MyLayer::create]

    //! [MyLayer::getMemoryShapes]
    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int requiredOutputs,
                                 std::vector<std::vector<int> > &outputs,
                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE;
    //! [MyLayer::getMemoryShapes]

    //! [MyLayer::forward]
    virtual void forward(cv::InputArrayOfArrays inputs,
                         cv::OutputArrayOfArrays outputs,
                         cv::OutputArrayOfArrays internals) CV_OVERRIDE;
    //! [MyLayer::forward]

    //! [MyLayer::finalize]
    virtual void finalize(cv::InputArrayOfArrays inputs,
                          cv::OutputArrayOfArrays outputs) CV_OVERRIDE;
    //! [MyLayer::finalize]
};
//! [A custom layer interface]

//! [Register a custom layer]
#include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS

static inline void loadNet()
{
    CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
    // ...
//! [Register a custom layer]

    //! [Register InterpLayer]
    CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
    cv::dnn::Net caffeNet = cv::dnn::readNet("/path/to/config.prototxt", "/path/to/weights.caffemodel");
    //! [Register InterpLayer]

    //! [Register ResizeBilinearLayer]
    CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
    cv::dnn::Net tfNet = cv::dnn::readNet("/path/to/graph.pb");
    //! [Register ResizeBilinearLayer]

    if (false) loadNet();  // To prevent unused function warning.
}

#endif  // __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__
518
samples/dnn/dasiamrpn_tracker.cpp
Normal file
518
samples/dnn/dasiamrpn_tracker.cpp
Normal file
@@ -0,0 +1,518 @@
|
||||
// DaSiamRPN tracker.
|
||||
// Original paper: https://arxiv.org/abs/1808.06048
|
||||
// Link to original repo: https://github.com/foolwood/DaSiamRPN
|
||||
// Links to onnx models:
|
||||
// - network: https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
|
||||
// - kernel_r1: https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
|
||||
// - kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
|
||||
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char *keys =
|
||||
"{ help h | | Print help message }"
|
||||
"{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
|
||||
"{ net | dasiamrpn_model.onnx | Path to onnx model of net}"
|
||||
"{ kernel_cls1 | dasiamrpn_kernel_cls1.onnx | Path to onnx model of kernel_r1 }"
|
||||
"{ kernel_r1 | dasiamrpn_kernel_r1.onnx | Path to onnx model of kernel_cls1 }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation }"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU }"
|
||||
;
|
||||
|
||||
// Initial parameters of the model
|
||||
struct trackerConfig
|
||||
{
|
||||
float windowInfluence = 0.43f;
|
||||
float lr = 0.4f;
|
||||
int scale = 8;
|
||||
bool swapRB = false;
|
||||
int totalStride = 8;
|
||||
float penaltyK = 0.055f;
|
||||
int exemplarSize = 127;
|
||||
int instanceSize = 271;
|
||||
float contextAmount = 0.5f;
|
||||
std::vector<float> ratios = { 0.33f, 0.5f, 1.0f, 2.0f, 3.0f };
|
||||
int anchorNum = int(ratios.size());
|
||||
Mat anchors;
|
||||
Mat windows;
|
||||
Scalar avgChans;
|
||||
Size imgSize = { 0, 0 };
|
||||
Rect2f targetBox = { 0, 0, 0, 0 };
|
||||
int scoreSize = (instanceSize - exemplarSize) / totalStride + 1;
|
||||
|
||||
void update_scoreSize()
|
||||
{
|
||||
scoreSize = int((instanceSize - exemplarSize) / totalStride + 1);
|
||||
}
|
||||
};
|
||||
|
||||
static void softmax(const Mat& src, Mat& dst);
|
||||
static void elementMax(Mat& src);
|
||||
static Mat generateHanningWindow(const trackerConfig& trackState);
|
||||
static Mat generateAnchors(trackerConfig& trackState);
|
||||
static Mat getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans);
|
||||
static float trackerEval(Mat img, trackerConfig& trackState, Net& siamRPN);
|
||||
static void trackerInit(Mat img, trackerConfig& trackState, Net& siamRPN, Net& siamKernelR1, Net& siamKernelCL1);
|
||||
|
||||
template <typename T> static
|
||||
T sizeCal(const T& w, const T& h)
|
||||
{
|
||||
T pad = (w + h) * T(0.5);
|
||||
T sz2 = (w + pad) * (h + pad);
|
||||
return sqrt(sz2);
|
||||
}
|
||||
|
||||
template <>
|
||||
Mat sizeCal(const Mat& w, const Mat& h)
|
||||
{
|
||||
Mat pad = (w + h) * 0.5;
|
||||
Mat sz2 = (w + pad).mul((h + pad));
|
||||
|
||||
cv::sqrt(sz2, sz2);
|
||||
return sz2;
|
||||
}
|
||||
|
||||
static
|
||||
int run(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
if (parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string inputName = parser.get<String>("input");
|
||||
std::string net = parser.get<String>("net");
|
||||
std::string kernel_cls1 = parser.get<String>("kernel_cls1");
|
||||
std::string kernel_r1 = parser.get<String>("kernel_r1");
|
||||
int backend = parser.get<int>("backend");
|
||||
int target = parser.get<int>("target");
|
||||
|
||||
// Read nets.
|
||||
Net siamRPN, siamKernelCL1, siamKernelR1;
|
||||
try
|
||||
{
|
||||
siamRPN = readNet(samples::findFile(net));
|
||||
siamKernelCL1 = readNet(samples::findFile(kernel_cls1));
|
||||
siamKernelR1 = readNet(samples::findFile(kernel_r1));
|
||||
}
|
||||
catch (const cv::Exception& ee)
|
||||
{
|
||||
std::cerr << "Exception: " << ee.what() << std::endl;
|
||||
std::cout << "Can't load the network by using the following files:" << std::endl;
|
||||
std::cout << "siamRPN : " << net << std::endl;
|
||||
std::cout << "siamKernelCL1 : " << kernel_cls1 << std::endl;
|
||||
std::cout << "siamKernelR1 : " << kernel_r1 << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Set model backend.
|
||||
siamRPN.setPreferableBackend(backend);
|
||||
siamRPN.setPreferableTarget(target);
|
||||
siamKernelR1.setPreferableBackend(backend);
|
||||
siamKernelR1.setPreferableTarget(target);
|
||||
siamKernelCL1.setPreferableBackend(backend);
|
||||
siamKernelCL1.setPreferableTarget(target);
|
||||
|
||||
const std::string winName = "DaSiamRPN";
|
||||
namedWindow(winName, WINDOW_AUTOSIZE);
|
||||
|
||||
// Open a video file or an image file or a camera stream.
|
||||
VideoCapture cap;
|
||||
|
||||
if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
|
||||
{
|
||||
int c = inputName.empty() ? 0 : inputName[0] - '0';
|
||||
std::cout << "Trying to open camera #" << c << " ..." << std::endl;
|
||||
if (!cap.open(c))
|
||||
{
|
||||
std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
else if (inputName.size())
|
||||
{
|
||||
inputName = samples::findFileOrKeep(inputName);
|
||||
if (!cap.open(inputName))
|
||||
{
|
||||
std::cout << "Could not open: " << inputName << std::endl;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Read the first image.
|
||||
Mat image;
|
||||
cap >> image;
|
||||
if (image.empty())
|
||||
{
|
||||
std::cerr << "Can't capture frame!" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
Mat image_select = image.clone();
|
||||
putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
Rect selectRect = selectROI(winName, image_select);
|
||||
std::cout << "ROI=" << selectRect << std::endl;
|
||||
|
||||
trackerConfig trackState;
|
||||
trackState.update_scoreSize();
|
||||
trackState.targetBox = Rect2f(
|
||||
float(selectRect.x) + float(selectRect.width) * 0.5f, // FIXIT don't use center in Rect structures, it is confusing
|
||||
float(selectRect.y) + float(selectRect.height) * 0.5f,
|
||||
float(selectRect.width),
|
||||
float(selectRect.height)
|
||||
);
|
||||
|
||||
// Set tracking template.
|
||||
trackerInit(image, trackState, siamRPN, siamKernelR1, siamKernelCL1);
|
||||
|
||||
TickMeter tickMeter;
|
||||
|
||||
for (int count = 0; ; ++count)
|
||||
{
|
||||
cap >> image;
|
||||
if (image.empty())
|
||||
{
|
||||
std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
tickMeter.start();
|
||||
float score = trackerEval(image, trackState, siamRPN);
|
||||
tickMeter.stop();
|
||||
|
||||
Rect rect = {
|
||||
int(trackState.targetBox.x - int(trackState.targetBox.width / 2)),
|
||||
int(trackState.targetBox.y - int(trackState.targetBox.height / 2)),
|
||||
int(trackState.targetBox.width),
|
||||
int(trackState.targetBox.height)
|
||||
};
|
||||
std::cout << "frame " << count <<
|
||||
": predicted score=" << score <<
|
||||
" rect=" << rect <<
|
||||
" time=" << tickMeter.getTimeMilli() << "ms" <<
|
||||
std::endl;
|
||||
|
||||
Mat render_image = image.clone();
|
||||
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
|
||||
|
||||
std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
|
||||
std::string scoreLabel = format("Score: %f", score);
|
||||
putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(winName, render_image);
|
||||
|
||||
tickMeter.reset();
|
||||
|
||||
int c = waitKey(1);
|
||||
if (c == 27 /*ESC*/)
|
||||
break;
|
||||
}
|
||||
|
||||
std::cout << "Exit" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
Mat generateHanningWindow(const trackerConfig& trackState)
|
||||
{
|
||||
Mat baseWindows, HanningWindows;
|
||||
|
||||
createHanningWindow(baseWindows, Size(trackState.scoreSize, trackState.scoreSize), CV_32F);
|
||||
baseWindows = baseWindows.reshape(0, { 1, trackState.scoreSize, trackState.scoreSize });
|
||||
HanningWindows = baseWindows.clone();
|
||||
for (int i = 1; i < trackState.anchorNum; i++)
|
||||
{
|
||||
HanningWindows.push_back(baseWindows);
|
||||
}
|
||||
|
||||
return HanningWindows;
|
||||
}
|
||||
|
||||
Mat generateAnchors(trackerConfig& trackState)
|
||||
{
|
||||
int totalStride = trackState.totalStride, scales = trackState.scale, scoreSize = trackState.scoreSize;
|
||||
std::vector<float> ratios = trackState.ratios;
|
||||
std::vector<Rect2f> baseAnchors;
|
||||
int anchorNum = int(ratios.size());
|
||||
int size = totalStride * totalStride;
|
||||
|
||||
float ori = -(float(scoreSize / 2)) * float(totalStride);
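// ori is the coordinate of the first anchor center, chosen so that the anchor grid is centered on the crop.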
|
||||
|
||||
for (auto i = 0; i < anchorNum; i++)
|
||||
{
|
||||
int ws = int(sqrt(size / ratios[i]));
|
||||
int hs = int(ws * ratios[i]);
|
||||
|
||||
float wws = float(ws) * scales;
|
||||
float hhs = float(hs) * scales;
|
||||
Rect2f anchor = { 0, 0, wws, hhs };
|
||||
baseAnchors.push_back(anchor);
|
||||
}
|
||||
|
||||
int anchorIndex[] = { 0, 0, 0, 0 };
|
||||
const int sizes[] = { 4, (int)ratios.size(), scoreSize, scoreSize };
|
||||
Mat anchors(4, sizes, CV_32F);
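// Anchor layout is 4 x anchorNum x scoreSize x scoreSize: channels 0/1 hold the anchor center x/y, channels 2/3 its width/height.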
|
||||
|
||||
for (auto i = 0; i < scoreSize; i++)
|
||||
{
|
||||
for (auto j = 0; j < scoreSize; j++)
|
||||
{
|
||||
for (auto k = 0; k < anchorNum; k++)
|
||||
{
|
||||
anchorIndex[0] = 1, anchorIndex[1] = k, anchorIndex[2] = i, anchorIndex[3] = j;
|
||||
anchors.at<float>(anchorIndex) = ori + totalStride * i;
|
||||
|
||||
anchorIndex[0] = 0;
|
||||
anchors.at<float>(anchorIndex) = ori + totalStride * j;
|
||||
|
||||
anchorIndex[0] = 2;
|
||||
anchors.at<float>(anchorIndex) = baseAnchors[k].width;
|
||||
|
||||
anchorIndex[0] = 3;
|
||||
anchors.at<float>(anchorIndex) = baseAnchors[k].height;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return anchors;
|
||||
}
|
||||
|
||||
Mat getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans)
|
||||
{
|
||||
Mat zCrop, dst;
|
||||
Size imgSize = img.size();
|
||||
float c = (originalSize + 1) / 2;
|
||||
float xMin = (float)cvRound(targetBox.x - c);
|
||||
float xMax = xMin + originalSize - 1;
|
||||
float yMin = (float)cvRound(targetBox.y - c);
|
||||
float yMax = yMin + originalSize - 1;
|
||||
|
||||
int leftPad = (int)(fmax(0., -xMin));
|
||||
int topPad = (int)(fmax(0., -yMin));
|
||||
int rightPad = (int)(fmax(0., xMax - imgSize.width + 1));
|
||||
int bottomPad = (int)(fmax(0., yMax - imgSize.height + 1));
|
||||
|
||||
xMin = xMin + leftPad;
|
||||
xMax = xMax + leftPad;
|
||||
yMax = yMax + topPad;
|
||||
yMin = yMin + topPad;
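// If the crop window extends beyond the image borders, pad the source with the average channel values first.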
|
||||
|
||||
if (topPad == 0 && bottomPad == 0 && leftPad == 0 && rightPad == 0)
|
||||
{
|
||||
img(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
|
||||
}
|
||||
else
|
||||
{
|
||||
copyMakeBorder(img, dst, topPad, bottomPad, leftPad, rightPad, BORDER_CONSTANT, avgChans);
|
||||
dst(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
|
||||
}
|
||||
|
||||
return zCrop;
|
||||
}
|
||||
|
||||
void softmax(const Mat& src, Mat& dst)
|
||||
{
|
||||
Mat maxVal;
|
||||
cv::max(src.row(1), src.row(0), maxVal);
|
||||
|
||||
src.row(1) -= maxVal;
|
||||
src.row(0) -= maxVal;
|
||||
|
||||
exp(src, dst);
|
||||
|
||||
Mat sumVal = dst.row(0) + dst.row(1);
|
||||
dst.row(0) = dst.row(0) / sumVal;
|
||||
dst.row(1) = dst.row(1) / sumVal;
|
||||
}
|
||||
|
||||
void elementMax(Mat& src)
|
||||
{
|
||||
int* p = src.size.p;
|
||||
int index[] = { 0, 0, 0, 0 };
|
||||
for (int n = 0; n < *p; n++)
|
||||
{
|
||||
for (int k = 0; k < *(p + 1); k++)
|
||||
{
|
||||
for (int i = 0; i < *(p + 2); i++)
|
||||
{
|
||||
for (int j = 0; j < *(p + 3); j++)
|
||||
{
|
||||
index[0] = n, index[1] = k, index[2] = i, index[3] = j;
|
||||
float& v = src.at<float>(index);
|
||||
v = fmax(v, 1.0f / v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float trackerEval(Mat img, trackerConfig& trackState, Net& siamRPN)
|
||||
{
|
||||
Rect2f targetBox = trackState.targetBox;
|
||||
|
||||
float wc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
|
||||
float hc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
|
||||
|
||||
float sz = sqrt(wc * hc);
|
||||
float scaleZ = trackState.exemplarSize / sz;
|
||||
|
||||
float searchSize = float((trackState.instanceSize - trackState.exemplarSize) / 2);
|
||||
float pad = searchSize / scaleZ;
|
||||
float sx = sz + 2 * pad;
|
||||
|
||||
Mat xCrop = getSubwindow(img, targetBox, (float)cvRound(sx), trackState.avgChans);
|
||||
|
||||
static Mat blob;
|
||||
std::vector<Mat> outs;
|
||||
std::vector<String> outNames;
|
||||
Mat delta, score;
|
||||
Mat sc, rc, penalty, pscore;
|
||||
|
||||
blobFromImage(xCrop, blob, 1.0, Size(trackState.instanceSize, trackState.instanceSize), Scalar(), trackState.swapRB, false, CV_32F);
|
||||
|
||||
siamRPN.setInput(blob);
|
||||
|
||||
outNames = siamRPN.getUnconnectedOutLayersNames();
|
||||
siamRPN.forward(outs, outNames);
|
||||
|
||||
delta = outs[0];
|
||||
score = outs[1];
|
||||
|
||||
score = score.reshape(0, { 2, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
|
||||
delta = delta.reshape(0, { 4, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
|
||||
|
||||
softmax(score, score);
|
||||
|
||||
targetBox.width *= scaleZ;
|
||||
targetBox.height *= scaleZ;
|
||||
|
||||
score = score.row(1);
|
||||
score = score.reshape(0, { trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
|
||||
|
||||
// Post processing
|
||||
delta.row(0) = delta.row(0).mul(trackState.anchors.row(2)) + trackState.anchors.row(0);
|
||||
delta.row(1) = delta.row(1).mul(trackState.anchors.row(3)) + trackState.anchors.row(1);
|
||||
exp(delta.row(2), delta.row(2));
|
||||
delta.row(2) = delta.row(2).mul(trackState.anchors.row(2));
|
||||
exp(delta.row(3), delta.row(3));
|
||||
delta.row(3) = delta.row(3).mul(trackState.anchors.row(3));
|
||||
|
||||
sc = sizeCal(delta.row(2), delta.row(3)) / sizeCal(targetBox.width, targetBox.height);
|
||||
elementMax(sc);
|
||||
|
||||
rc = delta.row(2).mul(1 / delta.row(3));
|
||||
rc = (targetBox.width / targetBox.height) / rc;
|
||||
elementMax(rc);
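// sc and rc measure the change of size and aspect ratio w.r.t. the previous target; large changes are penalized below.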
|
||||
|
||||
// Calculating the penalty
|
||||
exp(((rc.mul(sc) - 1.) * trackState.penaltyK * (-1.0)), penalty);
|
||||
penalty = penalty.reshape(0, { trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
|
||||
|
||||
pscore = penalty.mul(score);
|
||||
pscore = pscore * (1.0 - trackState.windowInfluence) + trackState.windows * trackState.windowInfluence;
|
||||
|
||||
int bestID[] = { 0 };
|
||||
// Find the index of best score.
|
||||
minMaxIdx(pscore.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 }), 0, 0, 0, bestID);
|
||||
delta = delta.reshape(0, { 4, trackState.anchorNum * trackState.scoreSize * trackState.scoreSize });
|
||||
penalty = penalty.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
|
||||
score = score.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
|
||||
|
||||
int index[] = { 0, bestID[0] };
|
||||
Rect2f resBox = { 0, 0, 0, 0 };
|
||||
|
||||
resBox.x = delta.at<float>(index) / scaleZ;
|
||||
index[0] = 1;
|
||||
resBox.y = delta.at<float>(index) / scaleZ;
|
||||
index[0] = 2;
|
||||
resBox.width = delta.at<float>(index) / scaleZ;
|
||||
index[0] = 3;
|
||||
resBox.height = delta.at<float>(index) / scaleZ;
|
||||
|
||||
float lr = penalty.at<float>(bestID) * score.at<float>(bestID) * trackState.lr;
|
||||
|
||||
resBox.x = resBox.x + targetBox.x;
|
||||
resBox.y = resBox.y + targetBox.y;
|
||||
targetBox.width /= scaleZ;
|
||||
targetBox.height /= scaleZ;
|
||||
|
||||
resBox.width = targetBox.width * (1 - lr) + resBox.width * lr;
|
||||
resBox.height = targetBox.height * (1 - lr) + resBox.height * lr;
|
||||
|
||||
resBox.x = float(fmax(0., fmin(float(trackState.imgSize.width), resBox.x)));
|
||||
resBox.y = float(fmax(0., fmin(float(trackState.imgSize.height), resBox.y)));
|
||||
resBox.width = float(fmax(10., fmin(float(trackState.imgSize.width), resBox.width)));
|
||||
resBox.height = float(fmax(10., fmin(float(trackState.imgSize.height), resBox.height)));
|
||||
|
||||
trackState.targetBox = resBox;
|
||||
return score.at<float>(bestID);
|
||||
}
|
||||
|
||||
void trackerInit(Mat img, trackerConfig& trackState, Net& siamRPN, Net& siamKernelR1, Net& siamKernelCL1)
|
||||
{
|
||||
Rect2f targetBox = trackState.targetBox;
|
||||
Mat anchors = generateAnchors(trackState);
|
||||
trackState.anchors = anchors;
|
||||
|
||||
Mat windows = generateHanningWindow(trackState);
|
||||
|
||||
trackState.windows = windows;
|
||||
trackState.imgSize = img.size();
|
||||
|
||||
trackState.avgChans = mean(img);
|
||||
float wc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
|
||||
float hc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
|
||||
float sz = (float)cvRound(sqrt(wc * hc));
|
||||
|
||||
Mat zCrop = getSubwindow(img, targetBox, sz, trackState.avgChans);
|
||||
static Mat blob;
|
||||
|
||||
blobFromImage(zCrop, blob, 1.0, Size(trackState.exemplarSize, trackState.exemplarSize), Scalar(), trackState.swapRB, false, CV_32F);
|
||||
siamRPN.setInput(blob);
|
||||
Mat out1;
|
||||
siamRPN.forward(out1, "63");
|
||||
|
||||
siamKernelCL1.setInput(out1);
|
||||
siamKernelR1.setInput(out1);
|
||||
|
||||
Mat cls1 = siamKernelCL1.forward();
|
||||
Mat r1 = siamKernelR1.forward();
|
||||
std::vector<int> r1_shape = { 20, 256, 4, 4 }, cls1_shape = { 10, 256, 4, 4 };
|
||||
|
||||
siamRPN.setParam(siamRPN.getLayerId("65"), 0, r1.reshape(0, r1_shape));
|
||||
siamRPN.setParam(siamRPN.getLayerId("68"), 0, cls1.reshape(0, cls1_shape));
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
try
|
||||
{
|
||||
return run(argc, argv);
|
||||
}
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
291
samples/dnn/dasiamrpn_tracker.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
DaSiamRPN tracker.
|
||||
Original paper: https://arxiv.org/abs/1808.06048
|
||||
Link to original repo: https://github.com/foolwood/DaSiamRPN
|
||||
Links to onnx models:
|
||||
network: https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
|
||||
kernel_r1: https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
|
||||
kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
class DaSiamRPNTracker:
|
||||
# Initialization of the tracker parameters, the initial bounding box and the networks
|
||||
def __init__(self, net="dasiamrpn_model.onnx", kernel_r1="dasiamrpn_kernel_r1.onnx", kernel_cls1="dasiamrpn_kernel_cls1.onnx"):
|
||||
self.windowing = "cosine"
|
||||
self.exemplar_size = 127
|
||||
self.instance_size = 271
|
||||
self.total_stride = 8
|
||||
self.score_size = (self.instance_size - self.exemplar_size) // self.total_stride + 1
|
||||
self.context_amount = 0.5
|
||||
self.ratios = [0.33, 0.5, 1, 2, 3]
|
||||
self.scales = [8, ]
|
||||
self.anchor_num = len(self.ratios) * len(self.scales)
|
||||
self.penalty_k = 0.055
|
||||
self.window_influence = 0.42
|
||||
self.lr = 0.295
|
||||
self.score = []
|
||||
if self.windowing == "cosine":
|
||||
self.window = np.outer(np.hanning(self.score_size), np.hanning(self.score_size))
|
||||
elif self.windowing == "uniform":
|
||||
self.window = np.ones((self.score_size, self.score_size))
|
||||
self.window = np.tile(self.window.flatten(), self.anchor_num)
|
||||
# Loading the network and kernel models
|
||||
self.net = cv.dnn.readNet(net)
|
||||
self.kernel_r1 = cv.dnn.readNet(kernel_r1)
|
||||
self.kernel_cls1 = cv.dnn.readNet(kernel_cls1)
|
||||
|
||||
def init(self, im, init_bb):
|
||||
target_pos, target_sz = np.array([init_bb[0], init_bb[1]]), np.array([init_bb[2], init_bb[3]])
|
||||
self.im_h = im.shape[0]
|
||||
self.im_w = im.shape[1]
|
||||
self.target_pos = target_pos
|
||||
self.target_sz = target_sz
|
||||
self.avg_chans = np.mean(im, axis=(0, 1))
|
||||
|
||||
# When generating the ONNX model from the pre-trained .pth model,
|
||||
# only one state of the network is used. In our case it is the state
|
||||
# with a big bounding box, so we were forced to add an assertion for
|
||||
# too small bounding boxes - the current state of the network cannot
|
||||
# work properly with such small bounding boxes
|
||||
if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
|
||||
raise AssertionError(
|
||||
"Initializing BB is too small-try to restart tracker with larger BB")
|
||||
|
||||
self.anchor = self.__generate_anchor()
|
||||
wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
||||
hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
||||
s_z = round(np.sqrt(wc_z * hc_z))
|
||||
z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
|
||||
z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
|
||||
self.net.setInput(z_crop)
|
||||
z_f = self.net.forward('63')
|
||||
self.kernel_r1.setInput(z_f)
|
||||
r1 = self.kernel_r1.forward()
|
||||
self.kernel_cls1.setInput(z_f)
|
||||
cls1 = self.kernel_cls1.forward()
|
||||
r1 = r1.reshape(20, 256, 4, 4)
|
||||
cls1 = cls1.reshape(10, 256 , 4, 4)
|
||||
self.net.setParam(self.net.getLayerId('65'), 0, r1)
|
||||
self.net.setParam(self.net.getLayerId('68'), 0, cls1)
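# As in the C++ sample, the exemplar features become the convolution weights of layers '65' and '68'.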
|
||||
|
||||
# Creating anchors for the tracking bounding box
|
||||
def __generate_anchor(self):
|
||||
self.anchor = np.zeros((self.anchor_num, 4), dtype = np.float32)
|
||||
size = self.total_stride * self.total_stride
|
||||
count = 0
|
||||
|
||||
for ratio in self.ratios:
|
||||
ws = int(np.sqrt(size / ratio))
|
||||
hs = int(ws * ratio)
|
||||
for scale in self.scales:
|
||||
wws = ws * scale
|
||||
hhs = hs * scale
|
||||
self.anchor[count] = [0, 0, wws, hhs]
|
||||
count += 1
|
||||
|
||||
score_sz = int(self.score_size)
|
||||
self.anchor = np.tile(self.anchor, score_sz * score_sz).reshape((-1, 4))
|
||||
ori = - (score_sz / 2) * self.total_stride
|
||||
xx, yy = np.meshgrid([ori + self.total_stride * dx for dx in range(score_sz)], [ori + self.total_stride * dy for dy in range(score_sz)])
|
||||
xx, yy = np.tile(xx.flatten(), (self.anchor_num, 1)).flatten(), np.tile(yy.flatten(), (self.anchor_num, 1)).flatten()
|
||||
self.anchor[:, 0], self.anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
|
||||
return self.anchor
|
||||
|
||||
# Function for updating tracker state
|
||||
def update(self, im):
|
||||
wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
|
||||
hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
|
||||
s_z = np.sqrt(wc_z * hc_z)
|
||||
scale_z = self.exemplar_size / s_z
|
||||
d_search = (self.instance_size - self.exemplar_size) / 2
|
||||
pad = d_search / scale_z
|
||||
s_x = round(s_z + 2 * pad)
|
||||
|
||||
# Region preprocessing part
|
||||
x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
|
||||
x_crop = x_crop.transpose(2, 0, 1).reshape(1, 3, 271, 271).astype(np.float32)
|
||||
self.score = self.__tracker_eval(x_crop, scale_z)
|
||||
self.target_pos[0] = max(0, min(self.im_w, self.target_pos[0]))
|
||||
self.target_pos[1] = max(0, min(self.im_h, self.target_pos[1]))
|
||||
self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
|
||||
self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))
|
||||
|
||||
cx, cy = self.target_pos
|
||||
w, h = self.target_sz
|
||||
updated_bb = (cx, cy, w, h)
|
||||
return True, updated_bb
|
||||
|
||||
# Function for updating position of the bounding box
|
||||
def __tracker_eval(self, x_crop, scale_z):
|
||||
target_size = self.target_sz * scale_z
|
||||
self.net.setInput(x_crop)
|
||||
# The regression ('66') and classification ('68') outputs are requested explicitly:
|
||||
outNames = ['66', '68']
|
||||
delta, score = self.net.forward(outNames)
|
||||
delta = np.transpose(delta, (1, 2, 3, 0))
|
||||
delta = np.ascontiguousarray(delta, dtype = np.float32)
|
||||
delta = np.reshape(delta, (4, -1))
|
||||
score = np.transpose(score, (1, 2, 3, 0))
|
||||
score = np.ascontiguousarray(score, dtype = np.float32)
|
||||
score = np.reshape(score, (2, -1))
|
||||
score = self.__softmax(score)[1, :]
|
||||
delta[0, :] = delta[0, :] * self.anchor[:, 2] + self.anchor[:, 0]
|
||||
delta[1, :] = delta[1, :] * self.anchor[:, 3] + self.anchor[:, 1]
|
||||
delta[2, :] = np.exp(delta[2, :]) * self.anchor[:, 2]
|
||||
delta[3, :] = np.exp(delta[3, :]) * self.anchor[:, 3]
|
||||
|
||||
def __change(r):
|
||||
return np.maximum(r, 1./r)
|
||||
|
||||
def __sz(w, h):
|
||||
pad = (w + h) * 0.5
|
||||
sz2 = (w + pad) * (h + pad)
|
||||
return np.sqrt(sz2)
|
||||
|
||||
def __sz_wh(wh):
|
||||
pad = (wh[0] + wh[1]) * 0.5
|
||||
sz2 = (wh[0] + pad) * (wh[1] + pad)
|
||||
return np.sqrt(sz2)
|
||||
|
||||
s_c = __change(__sz(delta[2, :], delta[3, :]) / (__sz_wh(target_size)))
|
||||
r_c = __change((target_size[0] / target_size[1]) / (delta[2, :] / delta[3, :]))
|
||||
penalty = np.exp(-(r_c * s_c - 1.) * self.penalty_k)
|
||||
pscore = penalty * score
|
||||
pscore = pscore * (1 - self.window_influence) + self.window * self.window_influence
|
||||
best_pscore_id = np.argmax(pscore)
|
||||
target = delta[:, best_pscore_id] / scale_z
|
||||
target_size /= scale_z
|
||||
lr = penalty[best_pscore_id] * score[best_pscore_id] * self.lr
|
||||
res_x = target[0] + self.target_pos[0]
|
||||
res_y = target[1] + self.target_pos[1]
|
||||
res_w = target_size[0] * (1 - lr) + target[2] * lr
|
||||
res_h = target_size[1] * (1 - lr) + target[3] * lr
|
||||
self.target_pos = np.array([res_x, res_y])
|
||||
self.target_sz = np.array([res_w, res_h])
|
||||
return score[best_pscore_id]
|
||||
|
||||
def __softmax(self, x):
|
||||
x_max = x.max(0)
|
||||
e_x = np.exp(x - x_max)
|
||||
y = e_x / e_x.sum(axis = 0)
|
||||
return y
|
||||
|
||||
# Reshape the cropped image for use in the model
|
||||
def __get_subwindow_tracking(self, im, model_size, original_sz):
|
||||
im_sz = im.shape
|
||||
c = (original_sz + 1) / 2
|
||||
context_xmin = round(self.target_pos[0] - c)
|
||||
context_xmax = context_xmin + original_sz - 1
|
||||
context_ymin = round(self.target_pos[1] - c)
|
||||
context_ymax = context_ymin + original_sz - 1
|
||||
left_pad = int(max(0., -context_xmin))
|
||||
top_pad = int(max(0., -context_ymin))
|
||||
right_pad = int(max(0., context_xmax - im_sz[1] + 1))
|
||||
bot_pad = int(max(0., context_ymax - im_sz[0] + 1))
|
||||
context_xmin += left_pad
|
||||
context_xmax += left_pad
|
||||
context_ymin += top_pad
|
||||
context_ymax += top_pad
|
||||
r, c, k = im.shape
|
||||
|
||||
if any([top_pad, bot_pad, left_pad, right_pad]):
|
||||
te_im = np.zeros((
|
||||
r + top_pad + bot_pad, c + left_pad + right_pad, k), np.uint8)
|
||||
te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
|
||||
if top_pad:
|
||||
te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
|
||||
if bot_pad:
|
||||
te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
|
||||
if left_pad:
|
||||
te_im[:, 0:left_pad, :] = self.avg_chans
|
||||
if right_pad:
|
||||
te_im[:, c + left_pad:, :] = self.avg_chans
|
||||
im_patch_original = te_im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
|
||||
else:
|
||||
im_patch_original = im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
|
||||
|
||||
if not np.array_equal(model_size, original_sz):
|
||||
im_patch_original = cv.resize(im_patch_original, (model_size, model_size))
|
||||
return im_patch_original
|
||||
|
||||
# Sample for using DaSiamRPN tracker
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run tracker")
|
||||
parser.add_argument("--input", type=str, help="Full path to input (empty for camera)")
|
||||
parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx", help="Full path to onnx model of net")
|
||||
parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Full path to onnx model of kernel_r1")
|
||||
parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Full path to onnx model of kernel_cls1")
|
||||
args = parser.parse_args()
|
||||
point1 = ()
|
||||
point2 = ()
|
||||
mark = True
|
||||
drawing = False
|
||||
cx, cy, w, h = 0.0, 0.0, 0, 0
|
||||
# Function for drawing the bounding box during the video stream
|
||||
def get_bb(event, x, y, flag, param):
|
||||
nonlocal point1, point2, cx, cy, w, h, drawing, mark
|
||||
|
||||
if event == cv.EVENT_LBUTTONDOWN:
|
||||
if not drawing:
|
||||
drawing = True
|
||||
point1 = (x, y)
|
||||
else:
|
||||
drawing = False
|
||||
|
||||
elif event == cv.EVENT_MOUSEMOVE:
|
||||
if drawing:
|
||||
point2 = (x, y)
|
||||
|
||||
elif event == cv.EVENT_LBUTTONUP:
|
||||
cx = point1[0] - (point1[0] - point2[0]) / 2
|
||||
cy = point1[1] - (point1[1] - point2[1]) / 2
|
||||
w = abs(point1[0] - point2[0])
|
||||
h = abs(point1[1] - point2[1])
|
||||
mark = False
|
||||
|
||||
# Creating window for visualization
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
cv.namedWindow("DaSiamRPN")
|
||||
cv.setMouseCallback("DaSiamRPN", get_bb)
|
||||
|
||||
whitespace_key = 32
|
||||
while cv.waitKey(40) != whitespace_key:
|
||||
has_frame, frame = cap.read()
|
||||
if not has_frame:
|
||||
sys.exit(0)
|
||||
cv.imshow("DaSiamRPN", frame)
|
||||
|
||||
while mark:
|
||||
twin = np.copy(frame)
|
||||
if point1 and point2:
|
||||
cv.rectangle(twin, point1, point2, (0, 255, 255), 3)
|
||||
cv.imshow("DaSiamRPN", twin)
|
||||
cv.waitKey(40)
|
||||
|
||||
init_bb = (cx, cy, w, h)
|
||||
tracker = DaSiamRPNTracker(args.net, args.kernel_r1, args.kernel_cls1)
|
||||
tracker.init(frame, init_bb)
|
||||
|
||||
# Tracking loop
|
||||
while cap.isOpened():
|
||||
has_frame, frame = cap.read()
|
||||
if not has_frame:
|
||||
sys.exit(0)
|
||||
_, new_bb = tracker.update(frame)
|
||||
cx, cy, w, h = new_bb
|
||||
cv.rectangle(frame, (int(cx - w // 2), int(cy - h // 2)), (int(cx - w // 2) + int(w), int(cy - h // 2) + int(h)),(0, 255, 255), 3)
|
||||
cv.imshow("DaSiamRPN", frame)
|
||||
key = cv.waitKey(1)
|
||||
if key == ord("q"):
|
||||
break
|
||||
|
||||
cap.release()
|
||||
cv.destroyAllWindows()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,23 @@
|
||||
from abc import ABC, ABCMeta, abstractmethod
|
||||
|
||||
|
||||
class AbstractModel(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def get_prepared_models(self):
|
||||
pass
|
||||
|
||||
|
||||
class Framework(object):
|
||||
in_blob_name = ''
|
||||
out_blob_name = ''
|
||||
|
||||
__metaclass__ = ABCMeta
|
||||
|
||||
@abstractmethod
|
||||
def get_name(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_output(self, input_blob):
|
||||
pass
|
||||
@@ -0,0 +1,96 @@
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...utils import get_final_summary_info
|
||||
|
||||
|
||||
class ClsAccEvaluation:
|
||||
log = sys.stdout
|
||||
img_classes = {}
|
||||
batch_size = 0
|
||||
|
||||
def __init__(self, log_path, img_classes_file, batch_size):
|
||||
self.log = open(log_path, 'w')
|
||||
self.img_classes = self.read_classes(img_classes_file)
|
||||
self.batch_size = batch_size
|
||||
|
||||
# collect the accuracies for both models
|
||||
self.general_quality_metric = []
|
||||
self.general_inference_time = []
|
||||
|
||||
@staticmethod
|
||||
def read_classes(img_classes_file):
|
||||
result = {}
|
||||
with open(img_classes_file) as file:
|
||||
for l in file.readlines():
|
||||
result[l.split()[0]] = int(l.split()[1])
|
||||
return result
|
||||
|
||||
def get_correct_answers(self, img_list, net_output_blob):
|
||||
correct_answers = 0
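# Top-5 accuracy: a sample counts as correct if its ground-truth class is among the five highest scores.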
|
||||
for i in range(len(img_list)):
|
||||
indexes = np.argsort(net_output_blob[i])[-5:]
|
||||
correct_index = self.img_classes[img_list[i]]
|
||||
if correct_index in indexes:
|
||||
correct_answers += 1
|
||||
return correct_answers
|
||||
|
||||
def process(self, frameworks, data_fetcher):
|
||||
sorted_imgs_names = sorted(self.img_classes.keys())
|
||||
correct_answers = [0] * len(frameworks)
|
||||
samples_handled = 0
|
||||
blobs_l1_diff = [0] * len(frameworks)
|
||||
blobs_l1_diff_count = [0] * len(frameworks)
|
||||
blobs_l_inf_diff = [sys.float_info.min] * len(frameworks)
|
||||
inference_time = [0.0] * len(frameworks)
|
||||
|
||||
for x in range(0, len(sorted_imgs_names), self.batch_size):
|
||||
sublist = sorted_imgs_names[x:x + self.batch_size]
|
||||
batch = data_fetcher.get_batch(sublist)
|
||||
|
||||
samples_handled += len(sublist)
|
||||
fw_accuracy = []
|
||||
fw_time = []
|
||||
frameworks_out = []
|
||||
for i in range(len(frameworks)):
|
||||
start = time.time()
|
||||
out = frameworks[i].get_output(batch)
|
||||
end = time.time()
|
||||
correct_answers[i] += self.get_correct_answers(sublist, out)
|
||||
fw_accuracy.append(100 * correct_answers[i] / float(samples_handled))
|
||||
frameworks_out.append(out)
|
||||
inference_time[i] += end - start
|
||||
fw_time.append(inference_time[i] / samples_handled * 1000)
|
||||
print(samples_handled, 'Accuracy for', frameworks[i].get_name() + ':', fw_accuracy[i], file=self.log)
|
||||
print("Inference time, ms ", frameworks[i].get_name(), fw_time[i], file=self.log)
|
||||
|
||||
self.general_quality_metric.append(fw_accuracy)
|
||||
self.general_inference_time.append(fw_time)
|
||||
|
||||
for i in range(1, len(frameworks)):
|
||||
log_str = frameworks[0].get_name() + " vs " + frameworks[i].get_name() + ':'
|
||||
diff = np.abs(frameworks_out[0] - frameworks_out[i])
|
||||
l1_diff = np.sum(diff) / diff.size
|
||||
print(samples_handled, "L1 difference", log_str, l1_diff, file=self.log)
|
||||
blobs_l1_diff[i] += l1_diff
|
||||
blobs_l1_diff_count[i] += 1
|
||||
if np.max(diff) > blobs_l_inf_diff[i]:
|
||||
blobs_l_inf_diff[i] = np.max(diff)
|
||||
print(samples_handled, "L_INF difference", log_str, blobs_l_inf_diff[i], file=self.log)
|
||||
|
||||
self.log.flush()
|
||||
|
||||
for i in range(1, len(blobs_l1_diff)):
|
||||
log_str = frameworks[0].get_name() + " vs " + frameworks[i].get_name() + ':'
|
||||
print('Final l1 diff', log_str, blobs_l1_diff[i] / blobs_l1_diff_count[i], file=self.log)
|
||||
|
||||
print(
|
||||
get_final_summary_info(
|
||||
self.general_quality_metric,
|
||||
self.general_inference_time,
|
||||
"accuracy"
|
||||
),
|
||||
file=self.log
|
||||
)
|
||||
@@ -0,0 +1,87 @@
|
||||
import os
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from ...img_utils import read_rgb_img, get_pytorch_preprocess
|
||||
from ...test.configs.default_preprocess_config import PYTORCH_RSZ_HEIGHT, PYTORCH_RSZ_WIDTH
|
||||
|
||||
|
||||
class DataFetch(object):
|
||||
imgs_dir = ''
|
||||
frame_size = 0
|
||||
bgr_to_rgb = False
|
||||
|
||||
__metaclass__ = ABCMeta
|
||||
|
||||
@abstractmethod
|
||||
def preprocess(self, img):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def reshape_img(img):
|
||||
img = img[:, :, 0:3].transpose(2, 0, 1)
|
||||
return np.expand_dims(img, 0)
|
||||
|
||||
def center_crop(self, img):
|
||||
cols = img.shape[1]
|
||||
rows = img.shape[0]
|
||||
|
||||
y1 = round((rows - self.frame_size) / 2)
|
||||
y2 = round(y1 + self.frame_size)
|
||||
x1 = round((cols - self.frame_size) / 2)
|
||||
x2 = round(x1 + self.frame_size)
|
||||
return img[y1:y2, x1:x2]
|
||||
|
||||
def initial_preprocess(self, img):
|
||||
min_dim = min(img.shape[-3], img.shape[-2])
|
||||
resize_ratio = self.frame_size / float(min_dim)
|
||||
|
||||
img = cv2.resize(img, (0, 0), fx=resize_ratio, fy=resize_ratio)
|
||||
img = self.center_crop(img)
|
||||
return img
|
||||
|
||||
def get_preprocessed_img(self, img_path):
|
||||
image_data = read_rgb_img(img_path, self.bgr_to_rgb)
|
||||
image_data = self.preprocess(image_data)
|
||||
return self.reshape_img(image_data)
|
||||
|
||||
def get_batch(self, img_names):
|
||||
assert type(img_names) is list
|
||||
batch = np.zeros((len(img_names), 3, self.frame_size, self.frame_size)).astype(np.float32)
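# The batch is laid out as NCHW: (batch, channels, height, width).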
|
||||
|
||||
for i in range(len(img_names)):
|
||||
img_name = img_names[i]
|
||||
img_file = os.path.join(self.imgs_dir, img_name)
|
||||
assert os.path.exists(img_file)
|
||||
|
||||
batch[i] = self.get_preprocessed_img(img_file)
|
||||
return batch
|
||||
|
||||
|
||||
class PyTorchPreprocessedFetch(DataFetch):
|
||||
def __init__(self, pytorch_cls_config, preprocess_input=None):
|
||||
self.imgs_dir = pytorch_cls_config.img_root_dir
|
||||
self.frame_size = pytorch_cls_config.frame_size
|
||||
self.bgr_to_rgb = pytorch_cls_config.bgr_to_rgb
|
||||
self.preprocess_input = preprocess_input
|
||||
|
||||
def preprocess(self, img):
|
||||
img = cv2.resize(img, (PYTORCH_RSZ_WIDTH, PYTORCH_RSZ_HEIGHT))
|
||||
img = self.center_crop(img)
|
||||
if self.preprocess_input:
|
||||
return self.preprocess_input(img)
|
||||
return get_pytorch_preprocess(img)
|
||||
|
||||
|
||||
class TFPreprocessedFetch(DataFetch):
|
||||
def __init__(self, tf_cls_config, preprocess_input):
|
||||
self.imgs_dir = tf_cls_config.img_root_dir
|
||||
self.frame_size = tf_cls_config.frame_size
|
||||
self.bgr_to_rgb = tf_cls_config.bgr_to_rgb
|
||||
self.preprocess_input = preprocess_input
|
||||
|
||||
def preprocess(self, img):
|
||||
img = self.initial_preprocess(img)
|
||||
return self.preprocess_input(img)
|
||||
@@ -0,0 +1,19 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from .test.configs.default_preprocess_config import BASE_IMG_SCALE_FACTOR
|
||||
|
||||
|
||||
def read_rgb_img(img_file, is_bgr_to_rgb=True):
|
||||
img = cv2.imread(img_file, cv2.IMREAD_COLOR)
|
||||
if is_bgr_to_rgb:
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
||||
return img
|
||||
|
||||
|
||||
def get_pytorch_preprocess(img):
|
||||
img = img.astype(np.float32)
|
||||
img *= BASE_IMG_SCALE_FACTOR
|
||||
img -= [0.485, 0.456, 0.406]
|
||||
img /= [0.229, 0.224, 0.225]
|
||||
return img
|
||||
@@ -0,0 +1,60 @@
|
||||
from .configs.test_config import TestClsConfig, TestClsModuleConfig
|
||||
from .model_test_pipeline import ModelTestPipeline
|
||||
from ..evaluation.classification.cls_accuracy_evaluator import ClsAccEvaluation
|
||||
from ..utils import get_test_module
|
||||
|
||||
|
||||
class ClsModelTestPipeline(ModelTestPipeline):
|
||||
def __init__(
|
||||
self,
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor,
|
||||
data_fetcher,
|
||||
img_processor=None,
|
||||
cls_args_parser=None,
|
||||
default_input_blob_preproc=None
|
||||
):
|
||||
super(ClsModelTestPipeline, self).__init__(
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor
|
||||
)
|
||||
|
||||
if cls_args_parser:
|
||||
self._parser = cls_args_parser
|
||||
|
||||
self.test_config = TestClsConfig()
|
||||
|
||||
parser_args = self._parser.parse_args()
|
||||
|
||||
if parser_args.test:
|
||||
self._test_module_config = TestClsModuleConfig()
|
||||
self._test_module = get_test_module(
|
||||
self._test_module_config.test_module_name,
|
||||
self._test_module_config.test_module_path
|
||||
)
|
||||
|
||||
if parser_args.default_img_preprocess:
|
||||
self._default_input_blob_preproc = default_input_blob_preproc
|
||||
if parser_args.evaluate:
|
||||
self._data_fetcher = data_fetcher(self.test_config, img_processor)
|
||||
|
||||
def _configure_test_module_params(self):
|
||||
self._test_module_param_list.extend((
|
||||
'--crop', self._test_module_config.crop,
|
||||
'--std', *self._test_module_config.std
|
||||
))
|
||||
|
||||
if self._test_module_config.rsz_height and self._test_module_config.rsz_width:
|
||||
self._test_module_param_list.extend((
|
||||
'--initial_height', self._test_module_config.rsz_height,
|
||||
'--initial_width', self._test_module_config.rsz_width,
|
||||
))
|
||||
|
||||
def _configure_acc_eval(self, log_path):
|
||||
self._accuracy_evaluator = ClsAccEvaluation(
|
||||
log_path,
|
||||
self.test_config.img_cls_file,
|
||||
self.test_config.batch_size
|
||||
)
|
||||
@@ -0,0 +1,37 @@
|
||||
BASE_IMG_SCALE_FACTOR = 1 / 255.0
|
||||
PYTORCH_RSZ_HEIGHT = 256
|
||||
PYTORCH_RSZ_WIDTH = 256
|
||||
|
||||
pytorch_resize_input_blob = {
|
||||
"mean": ["123.675", "116.28", "103.53"],
|
||||
"scale": str(BASE_IMG_SCALE_FACTOR),
|
||||
"std": ["0.229", "0.224", "0.225"],
|
||||
"crop": "True",
|
||||
"rgb": True,
|
||||
"rsz_height": str(PYTORCH_RSZ_HEIGHT),
|
||||
"rsz_width": str(PYTORCH_RSZ_WIDTH)
|
||||
}
|
||||
|
||||
pytorch_input_blob = {
|
||||
"mean": ["123.675", "116.28", "103.53"],
|
||||
"scale": str(BASE_IMG_SCALE_FACTOR),
|
||||
"std": ["0.229", "0.224", "0.225"],
|
||||
"crop": "True",
|
||||
"rgb": True
|
||||
}
|
||||
|
||||
tf_input_blob = {
|
||||
"scale": str(1 / 127.5),
|
||||
"mean": ["127.5", "127.5", "127.5"],
|
||||
"std": [],
|
||||
"crop": "True",
|
||||
"rgb": True
|
||||
}
|
||||
|
||||
tf_model_blob_caffe_mode = {
|
||||
"mean": ["103.939", "116.779", "123.68"],
|
||||
"scale": "1.0",
|
||||
"std": [],
|
||||
"crop": "True",
|
||||
"rgb": False
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommonConfig:
|
||||
output_data_root_dir: str = "dnn_model_runner/dnn_conversion"
|
||||
logs_dir: str = os.path.join(output_data_root_dir, "logs")
|
||||
log_file_path: str = os.path.join(logs_dir, "{}_log.txt")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestClsConfig:
|
||||
batch_size: int = 1
|
||||
frame_size: int = 224
|
||||
img_root_dir: str = "./ILSVRC2012_img_val"
|
||||
# location of the file with image-to-class matching
|
||||
img_cls_file: str = "./val.txt"
|
||||
bgr_to_rgb: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestClsModuleConfig:
|
||||
cls_test_data_dir: str = "../data"
|
||||
test_module_name: str = "classification"
|
||||
test_module_path: str = "classification.py"
|
||||
input_img: str = os.path.join(cls_test_data_dir, "squirrel_cls.jpg")
|
||||
model: str = ""
|
||||
|
||||
frame_height: str = str(TestClsConfig.frame_size)
|
||||
frame_width: str = str(TestClsConfig.frame_size)
|
||||
scale: str = "1.0"
|
||||
mean: List[str] = field(default_factory=lambda: ["0.0", "0.0", "0.0"])
|
||||
std: List[str] = field(default_factory=list)
|
||||
crop: str = "False"
|
||||
rgb: str = "True"
|
||||
rsz_height: str = ""
|
||||
rsz_width: str = ""
|
||||
classes: str = os.path.join(cls_test_data_dir, "dnn", "classification_classes_ILSVRC2012.txt")
|
||||
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .configs.test_config import CommonConfig
|
||||
from ..utils import create_parser, plot_acc
|
||||
|
||||
|
||||
class ModelTestPipeline:
|
||||
def __init__(
|
||||
self,
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor
|
||||
):
|
||||
self._net_model = network_model
|
||||
self._model_processor = model_processor
|
||||
self._dnn_model_processor = dnn_model_processor
|
||||
|
||||
self._parser = create_parser()
|
||||
|
||||
self._test_module = None
|
||||
self._test_module_config = None
|
||||
self._test_module_param_list = None
|
||||
|
||||
self.test_config = None
|
||||
self._data_fetcher = None
|
||||
|
||||
self._default_input_blob_preproc = None
|
||||
self._accuracy_evaluator = None
|
||||
|
||||
def init_test_pipeline(self):
|
||||
cmd_args = self._parser.parse_args()
|
||||
model_dict = self._net_model.get_prepared_models()
|
||||
|
||||
model_names = list(model_dict.keys())
|
||||
print(
|
||||
"The model {} was successfully obtained and converted to OpenCV {}".format(model_names[0], model_names[1])
|
||||
)
|
||||
|
||||
if cmd_args.test:
|
||||
if not self._test_module_config.model:
|
||||
self._test_module_config.model = self._net_model.model_path["full_path"]
|
||||
|
||||
if cmd_args.default_img_preprocess:
|
||||
self._test_module_config.scale = self._default_input_blob_preproc["scale"]
|
||||
self._test_module_config.mean = self._default_input_blob_preproc["mean"]
|
||||
self._test_module_config.std = self._default_input_blob_preproc["std"]
|
||||
self._test_module_config.crop = self._default_input_blob_preproc["crop"]
|
||||
|
||||
if "rsz_height" in self._default_input_blob_preproc and "rsz_width" in self._default_input_blob_preproc:
|
||||
self._test_module_config.rsz_height = self._default_input_blob_preproc["rsz_height"]
|
||||
self._test_module_config.rsz_width = self._default_input_blob_preproc["rsz_width"]
|
||||
|
||||
self._test_module_param_list = [
|
||||
'--model', self._test_module_config.model,
|
||||
'--input', self._test_module_config.input_img,
|
||||
'--width', self._test_module_config.frame_width,
|
||||
'--height', self._test_module_config.frame_height,
|
||||
'--scale', self._test_module_config.scale,
|
||||
'--mean', *self._test_module_config.mean,
|
||||
'--std', *self._test_module_config.std,
|
||||
'--classes', self._test_module_config.classes,
|
||||
]
|
||||
|
||||
if self._default_input_blob_preproc["rgb"]:
|
||||
self._test_module_param_list.append('--rgb')
|
||||
|
||||
self._configure_test_module_params()
|
||||
|
||||
self._test_module.main(
|
||||
self._test_module_param_list
|
||||
)
|
||||
|
||||
if cmd_args.evaluate:
|
||||
original_model_name = model_names[0]
|
||||
dnn_model_name = model_names[1]
|
||||
|
||||
self.run_test_pipeline(
|
||||
[
|
||||
self._model_processor(model_dict[original_model_name], original_model_name),
|
||||
self._dnn_model_processor(model_dict[dnn_model_name], dnn_model_name)
|
||||
],
|
||||
original_model_name.replace(" ", "_")
|
||||
)
|
||||
|
||||
def run_test_pipeline(
|
||||
self,
|
||||
models_list,
|
||||
formatted_exp_name,
|
||||
is_plot_acc=True
|
||||
):
|
||||
log_path, logs_dir = self._configure_eval_log(formatted_exp_name)
|
||||
|
||||
print(
|
||||
"===== Running evaluation of the model with the following params:\n"
|
||||
"\t* val data location: {}\n"
|
||||
"\t* log file location: {}\n".format(
|
||||
self.test_config.img_root_dir,
|
||||
log_path
|
||||
)
|
||||
)
|
||||
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
self._configure_acc_eval(log_path)
|
||||
self._accuracy_evaluator.process(models_list, self._data_fetcher)
|
||||
|
||||
if is_plot_acc:
|
||||
plot_acc(
|
||||
np.array(self._accuracy_evaluator.general_inference_time),
|
||||
formatted_exp_name
|
||||
)
|
||||
|
||||
print("===== End of the evaluation pipeline =====")
|
||||
|
||||
def _configure_acc_eval(self, log_path):
|
||||
pass
|
||||
|
||||
def _configure_test_module_params(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _configure_eval_log(formatted_exp_name):
|
||||
common_test_config = CommonConfig()
|
||||
return common_test_config.log_file_path.format(formatted_exp_name), common_test_config.logs_dir
|
||||
153
samples/dnn/dnn_model_runner/dnn_conversion/common/utils.py
Normal file
@@ -0,0 +1,153 @@
|
||||
import argparse
|
||||
import importlib.util
|
||||
import os
|
||||
import random
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
|
||||
from .test.configs.test_config import CommonConfig
|
||||
|
||||
SEED_VAL = 42
|
||||
DNN_LIB = "DNN"
|
||||
# common path for saving models
|
||||
MODEL_PATH_ROOT = os.path.join(CommonConfig().output_data_root_dir, "{}/models")
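# e.g. for lib_name == "pytorch" this expands to dnn_model_runner/dnn_conversion/pytorch/models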
|
||||
|
||||
|
||||
def get_full_model_path(lib_name, model_full_name):
|
||||
model_path = MODEL_PATH_ROOT.format(lib_name)
|
||||
return {
|
||||
"path": model_path,
|
||||
"full_path": os.path.join(model_path, model_full_name)
|
||||
}
|
||||
|
||||
|
||||
def plot_acc(data_list, experiment_name):
|
||||
plt.figure(figsize=[8, 6])
|
||||
plt.plot(data_list[:, 0], "r", linewidth=2.5, label="Original Model")
|
||||
plt.plot(data_list[:, 1], "b", linewidth=2.5, label="Converted DNN Model")
|
||||
plt.xlabel("Iterations ", fontsize=15)
|
||||
plt.ylabel("Time (ms)", fontsize=15)
|
||||
plt.title(experiment_name, fontsize=15)
|
||||
plt.legend()
|
||||
full_path_to_fig = os.path.join(CommonConfig().output_data_root_dir, experiment_name + ".png")
|
||||
plt.savefig(full_path_to_fig, bbox_inches="tight")
|
||||
|
||||
|
||||
def get_final_summary_info(general_quality_metric, general_inference_time, metric_name):
|
||||
general_quality_metric = np.array(general_quality_metric)
|
||||
general_inference_time = np.array(general_inference_time)
|
||||
summary_line = "===== End of processing. General results:\n"
|
||||
"\t* mean {} for the original model: {}\t"
|
||||
"\t* mean time (min) for the original model inferences: {}\n"
|
||||
"\t* mean {} for the DNN model: {}\t"
|
||||
"\t* mean time (min) for the DNN model inferences: {}\n".format(
|
||||
metric_name, np.mean(general_quality_metric[:, 0]),
|
||||
np.mean(general_inference_time[:, 0]) / 60000,
|
||||
metric_name, np.mean(general_quality_metric[:, 1]),
|
||||
np.mean(general_inference_time[:, 1]) / 60000,
|
||||
))
|
||||
return summary_line
|
||||
|
||||
|
||||
def set_common_reproducibility():
|
||||
random.seed(SEED_VAL)
|
||||
np.random.seed(SEED_VAL)
|
||||
|
||||
|
||||
def set_pytorch_env():
|
||||
set_common_reproducibility()
|
||||
torch.manual_seed(SEED_VAL)
|
||||
torch.set_printoptions(precision=10)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(SEED_VAL)
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
|
||||
def set_tf_env(is_use_gpu=True):
|
||||
set_common_reproducibility()
|
||||
tf.random.set_seed(SEED_VAL)
|
||||
os.environ["TF_DETERMINISTIC_OPS"] = "1"
|
||||
|
||||
if tf.config.list_physical_devices("GPU") and is_use_gpu:
|
||||
gpu_devices = tf.config.list_physical_devices("GPU")
|
||||
tf.config.experimental.set_visible_devices(gpu_devices[0], "GPU")
|
||||
tf.config.experimental.set_memory_growth(gpu_devices[0], True)
|
||||
os.environ["TF_USE_CUDNN"] = "1"
|
||||
else:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
|
||||
|
||||
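# Helper used as the type= callback of the boolean command-line options defined in create_parser() below.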
def str_bool(input_val):
|
||||
if input_val.lower() in ('yes', 'true', 't', 'y', '1'):
|
||||
return True
|
||||
elif input_val.lower() in ('no', 'false', 'f', 'n', '0'):
|
||||
return False
|
||||
else:
|
||||
raise argparse.ArgumentTypeError('Boolean value was expected')
|
||||
|
||||
|
||||
def get_formatted_model_list(model_list):
|
||||
note_line = 'Please, choose the model from the below list:\n'
|
||||
spaces_to_set = ' ' * (len(note_line) - 2)
|
||||
return note_line + ''.join([spaces_to_set, '{} \n'] * len(model_list)).format(*model_list)
|
||||
|
||||
|
||||
def model_str(model_list):
|
||||
def type_model_list(input_val):
|
||||
if input_val.lower() in model_list:
|
||||
return input_val.lower()
|
||||
else:
|
||||
raise argparse.ArgumentTypeError(
|
||||
'The model is currently unavailable for test.\n' +
|
||||
get_formatted_model_list(model_list)
|
||||
)
|
||||
|
||||
return type_model_list
|
||||
|
||||
|
||||
def get_test_module(test_module_name, test_module_path):
|
||||
module_spec = importlib.util.spec_from_file_location(test_module_name, test_module_path)
|
||||
test_module = importlib.util.module_from_spec(module_spec)
|
||||
module_spec.loader.exec_module(test_module)
|
||||
return test_module
|
||||
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to run the model with OpenCV for testing.",
|
||||
default=False
|
||||
)
|
||||
parser.add_argument(
|
||||
"--default_img_preprocess",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to preprocess the input image with defined"
|
||||
" PyTorch or TF functions for model test with OpenCV.",
|
||||
default=False
|
||||
)
|
||||
parser.add_argument(
|
||||
"--evaluate",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to run evaluation of the models (ex.: TF vs OpenCV networks).",
|
||||
default=True
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def create_extended_parser(model_list):
|
||||
parser = create_parser()
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=model_str(model_list=model_list),
|
||||
help="\nDefine the model name to test.\n" +
|
||||
get_formatted_model_list(model_list),
|
||||
required=True
|
||||
)
|
||||
return parser
|
||||
@@ -0,0 +1,71 @@
|
||||
from torchvision import models
|
||||
|
||||
from ..pytorch_model import (
|
||||
PyTorchModelPreparer,
|
||||
PyTorchModelProcessor,
|
||||
PyTorchDnnModelProcessor
|
||||
)
|
||||
from ...common.evaluation.classification.cls_data_fetcher import PyTorchPreprocessedFetch
|
||||
from ...common.test.cls_model_test_pipeline import ClsModelTestPipeline
|
||||
from ...common.test.configs.default_preprocess_config import pytorch_resize_input_blob
|
||||
from ...common.test.configs.test_config import TestClsConfig
|
||||
from ...common.utils import set_pytorch_env, create_extended_parser
|
||||
|
||||
model_dict = {
|
||||
"alexnet": models.alexnet,
|
||||
|
||||
"vgg11": models.vgg11,
|
||||
"vgg13": models.vgg13,
|
||||
"vgg16": models.vgg16,
|
||||
"vgg19": models.vgg19,
|
||||
|
||||
"resnet18": models.resnet18,
|
||||
"resnet34": models.resnet34,
|
||||
"resnet50": models.resnet50,
|
||||
"resnet101": models.resnet101,
|
||||
"resnet152": models.resnet152,
|
||||
|
||||
"squeezenet1_0": models.squeezenet1_0,
|
||||
"squeezenet1_1": models.squeezenet1_1,
|
||||
|
||||
"resnext50_32x4d": models.resnext50_32x4d,
|
||||
"resnext101_32x8d": models.resnext101_32x8d,
|
||||
|
||||
"wide_resnet50_2": models.wide_resnet50_2,
|
||||
"wide_resnet101_2": models.wide_resnet101_2
|
||||
}
|
||||
|
||||
|
||||
class PyTorchClsModel(PyTorchModelPreparer):
|
||||
def __init__(self, height, width, model_name, original_model):
|
||||
super(PyTorchClsModel, self).__init__(height, width, model_name, original_model)
|
||||
|
||||
|
||||
def main():
|
||||
set_pytorch_env()
|
||||
|
||||
parser = create_extended_parser(list(model_dict.keys()))
|
||||
cmd_args = parser.parse_args()
|
||||
model_name = cmd_args.model_name
|
||||
|
||||
cls_model = PyTorchClsModel(
|
||||
height=TestClsConfig().frame_size,
|
||||
width=TestClsConfig().frame_size,
|
||||
model_name=model_name,
|
||||
original_model=model_dict[model_name](pretrained=True)
|
||||
)
|
||||
|
||||
pytorch_cls_pipeline = ClsModelTestPipeline(
|
||||
network_model=cls_model,
|
||||
model_processor=PyTorchModelProcessor,
|
||||
dnn_model_processor=PyTorchDnnModelProcessor,
|
||||
data_fetcher=PyTorchPreprocessedFetch,
|
||||
cls_args_parser=parser,
|
||||
default_input_blob_preproc=pytorch_resize_input_blob
|
||||
)
|
||||
|
||||
pytorch_cls_pipeline.init_test_pipeline()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,139 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
from torchvision import models
|
||||
|
||||
|
||||
def get_pytorch_onnx_model(original_model):
|
||||
# directory where the converted model will be saved
|
||||
onnx_model_path = "models"
|
||||
# file name of the converted model
|
||||
onnx_model_name = "resnet50.onnx"
|
||||
|
||||
# create the directory for the converted model
|
||||
os.makedirs(onnx_model_path, exist_ok=True)
|
||||
|
||||
# get full path to the converted model
|
||||
full_model_path = os.path.join(onnx_model_path, onnx_model_name)
|
||||
|
||||
# generate model input
|
||||
generated_input = Variable(
|
||||
torch.randn(1, 3, 224, 224)
|
||||
)
|
||||
|
||||
# model export into ONNX format
|
||||
torch.onnx.export(
|
||||
original_model,
|
||||
generated_input,
|
||||
full_model_path,
|
||||
verbose=True,
|
||||
input_names=["input"],
|
||||
output_names=["output"],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return full_model_path
|
||||
|
||||
|
||||
def get_preprocessed_img(img_path):
|
||||
# read the image
|
||||
input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
|
||||
input_img = input_img.astype(np.float32)
|
||||
|
||||
input_img = cv2.resize(input_img, (256, 256))
|
||||
|
||||
# define preprocess parameters
|
||||
mean = np.array([0.485, 0.456, 0.406]) * 255.0
|
||||
scale = 1 / 255.0
|
||||
std = [0.229, 0.224, 0.225]
|
||||
|
||||
# prepare input blob to fit the model input:
|
||||
# 1. subtract mean
|
||||
# 2. scale to set pixel values from 0 to 1
|
||||
input_blob = cv2.dnn.blobFromImage(
|
||||
image=input_img,
|
||||
scalefactor=scale,
|
||||
size=(224, 224), # img target size
|
||||
mean=mean,
|
||||
swapRB=True, # BGR -> RGB
|
||||
crop=True # center crop
|
||||
)
|
||||
# 3. divide by std
|
||||
input_blob[0] /= np.asarray(std, dtype=np.float32).reshape(3, 1, 1)
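# blobFromImage itself has no std argument, so the division by std is applied to the blob manually here.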
|
||||
return input_blob
|
||||
|
||||
|
||||
def get_imagenet_labels(labels_path):
|
||||
with open(labels_path) as f:
|
||||
imagenet_labels = [line.strip() for line in f.readlines()]
|
||||
return imagenet_labels
|
||||
|
||||
|
||||
def get_opencv_dnn_prediction(opencv_net, preproc_img, imagenet_labels):
|
||||
# set OpenCV DNN input
|
||||
opencv_net.setInput(preproc_img)
|
||||
|
||||
# OpenCV DNN inference
|
||||
out = opencv_net.forward()
|
||||
print("OpenCV DNN prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
print("* confidence: {:.4f}".format(confidence))
|
||||
|
||||
|
||||
def get_pytorch_dnn_prediction(original_net, preproc_img, imagenet_labels):
|
||||
original_net.eval()
|
||||
preproc_img = torch.FloatTensor(preproc_img)
|
||||
|
||||
# inference
|
||||
with torch.no_grad():
|
||||
out = original_net(preproc_img)
|
||||
|
||||
print("\nPyTorch model prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = torch.argmax(out, axis=1).item()
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* confidence: {:.4f}".format(confidence.item()))
|
||||
|
||||
|
||||
def main():
|
||||
# initialize PyTorch ResNet-50 model
|
||||
original_model = models.resnet50(pretrained=True)
|
||||
|
||||
# get the path to the PyTorch model converted to ONNX
|
||||
full_model_path = get_pytorch_onnx_model(original_model)
|
||||
|
||||
# read converted .onnx model with OpenCV API
|
||||
opencv_net = cv2.dnn.readNetFromONNX(full_model_path)
|
||||
print("OpenCV model was successfully read. Layer IDs: \n", opencv_net.getLayerNames())
|
||||
|
||||
# get preprocessed image
|
||||
input_img = get_preprocessed_img("../data/squirrel_cls.jpg")
|
||||
|
||||
# get ImageNet labels
|
||||
imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt")
|
||||
|
||||
# obtain OpenCV DNN predictions
|
||||
get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels)
|
||||
|
||||
# obtain original PyTorch ResNet50 predictions
|
||||
get_pytorch_dnn_prediction(original_model, input_img, imagenet_labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,50 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
from torchvision import models
|
||||
|
||||
|
||||
def get_pytorch_onnx_model(original_model):
|
||||
# directory where the converted model will be saved
|
||||
onnx_model_path = "models"
|
||||
# file name of the converted model
|
||||
onnx_model_name = "resnet50.onnx"
|
||||
|
||||
# create the directory for the converted model
|
||||
os.makedirs(onnx_model_path, exist_ok=True)
|
||||
|
||||
# get full path to the converted model
|
||||
full_model_path = os.path.join(onnx_model_path, onnx_model_name)
|
||||
|
||||
# generate model input
|
||||
generated_input = Variable(
|
||||
torch.randn(1, 3, 224, 224)
|
||||
)
|
||||
|
||||
# model export into ONNX format
|
||||
torch.onnx.export(
|
||||
original_model,
|
||||
generated_input,
|
||||
full_model_path,
|
||||
verbose=True,
|
||||
input_names=["input"],
|
||||
output_names=["output"],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return full_model_path
|
||||
|
||||
|
||||
def main():
|
||||
# initialize PyTorch ResNet-50 model
|
||||
original_model = models.resnet50(pretrained=True)
|
||||
|
||||
# get the path to the PyTorch model converted to ONNX
|
||||
full_model_path = get_pytorch_onnx_model(original_model)
|
||||
print("PyTorch ResNet-50 model was successfully converted: ", full_model_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,98 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
|
||||
from ..common.abstract_model import AbstractModel, Framework
|
||||
from ..common.utils import DNN_LIB, get_full_model_path
|
||||
|
||||
CURRENT_LIB = "PyTorch"
|
||||
MODEL_FORMAT = ".onnx"
|
||||
|
||||
|
||||
class PyTorchModelPreparer(AbstractModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
height,
|
||||
width,
|
||||
model_name="default",
|
||||
original_model=object,
|
||||
batch_size=1,
|
||||
default_input_name="input",
|
||||
default_output_name="output"
|
||||
):
|
||||
self._height = height
|
||||
self._width = width
|
||||
self._model_name = model_name
|
||||
self._original_model = original_model
|
||||
self._batch_size = batch_size
|
||||
self._default_input_name = default_input_name
|
||||
self._default_output_name = default_output_name
|
||||
|
||||
self.model_path = self._set_model_path()
|
||||
self._dnn_model = self._set_dnn_model()
|
||||
|
||||
def _set_dnn_model(self):
|
||||
generated_input = Variable(torch.randn(
|
||||
self._batch_size, 3, self._height, self._width)
|
||||
)
|
||||
os.makedirs(self.model_path["path"], exist_ok=True)
|
||||
torch.onnx.export(
|
||||
self._original_model,
|
||||
generated_input,
|
||||
self.model_path["full_path"],
|
||||
verbose=True,
|
||||
input_names=[self._default_input_name],
|
||||
output_names=[self._default_output_name],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return cv2.dnn.readNetFromONNX(self.model_path["full_path"])
|
||||
|
||||
def _set_model_path(self):
|
||||
model_to_save = self._model_name + MODEL_FORMAT
|
||||
return get_full_model_path(CURRENT_LIB.lower(), model_to_save)
|
||||
|
||||
def get_prepared_models(self):
|
||||
return {
|
||||
CURRENT_LIB + " " + self._model_name: self._original_model,
|
||||
DNN_LIB + " " + self._model_name: self._dnn_model
|
||||
}
|
||||
|
||||
|
||||
class PyTorchModelProcessor(Framework):
|
||||
def __init__(self, prepared_model, model_name):
|
||||
self._prepared_model = prepared_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
tensor = torch.FloatTensor(input_blob)
|
||||
self._prepared_model.eval()
|
||||
|
||||
with torch.no_grad():
|
||||
model_out = self._prepared_model(tensor)
|
||||
|
||||
# segmentation case: torchvision segmentation models return a dict with 'out' (and optional 'aux') entries
|
||||
if len(model_out) == 2:
|
||||
model_out = model_out['out']
|
||||
|
||||
out = model_out.detach().numpy()
|
||||
return out
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
||||
|
||||
|
||||
class PyTorchDnnModelProcessor(Framework):
|
||||
def __init__(self, prepared_dnn_model, model_name):
|
||||
self._prepared_dnn_model = prepared_dnn_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
self._prepared_dnn_model.setInput(input_blob, '')
|
||||
return self._prepared_dnn_model.forward()
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
||||
@@ -0,0 +1,9 @@
|
||||
# Python 3.7.5
|
||||
onnx>=1.7.0
|
||||
numpy>=1.19.1
|
||||
|
||||
torch>=1.5.1
|
||||
torchvision>=0.6.1
|
||||
|
||||
tensorflow>=2.1.0
|
||||
tensorflow-gpu>=2.1.0
|
||||
@@ -0,0 +1,104 @@
|
||||
from tensorflow.keras.applications import (
|
||||
VGG16, vgg16,
|
||||
VGG19, vgg19,
|
||||
|
||||
ResNet50, resnet,
|
||||
ResNet101,
|
||||
ResNet152,
|
||||
|
||||
DenseNet121, densenet,
|
||||
DenseNet169,
|
||||
DenseNet201,
|
||||
|
||||
InceptionResNetV2, inception_resnet_v2,
|
||||
InceptionV3, inception_v3,
|
||||
|
||||
MobileNet, mobilenet,
|
||||
MobileNetV2, mobilenet_v2,
|
||||
|
||||
NASNetLarge, nasnet,
|
||||
NASNetMobile,
|
||||
|
||||
Xception, xception
|
||||
)
|
||||
|
||||
from ..tf_model import TFModelPreparer
|
||||
from ..tf_model import (
|
||||
TFModelProcessor,
|
||||
TFDnnModelProcessor
|
||||
)
|
||||
from ...common.evaluation.classification.cls_data_fetcher import TFPreprocessedFetch
|
||||
from ...common.test.cls_model_test_pipeline import ClsModelTestPipeline
|
||||
from ...common.test.configs.default_preprocess_config import (
|
||||
tf_input_blob,
|
||||
pytorch_input_blob,
|
||||
tf_model_blob_caffe_mode
|
||||
)
|
||||
from ...common.utils import set_tf_env, create_extended_parser
|
||||
|
||||
model_dict = {
|
||||
"vgg16": [VGG16, vgg16, tf_model_blob_caffe_mode],
|
||||
"vgg19": [VGG19, vgg19, tf_model_blob_caffe_mode],
|
||||
|
||||
"resnet50": [ResNet50, resnet, tf_model_blob_caffe_mode],
|
||||
"resnet101": [ResNet101, resnet, tf_model_blob_caffe_mode],
|
||||
"resnet152": [ResNet152, resnet, tf_model_blob_caffe_mode],
|
||||
|
||||
"densenet121": [DenseNet121, densenet, pytorch_input_blob],
|
||||
"densenet169": [DenseNet169, densenet, pytorch_input_blob],
|
||||
"densenet201": [DenseNet201, densenet, pytorch_input_blob],
|
||||
|
||||
"inceptionresnetv2": [InceptionResNetV2, inception_resnet_v2, tf_input_blob],
|
||||
"inceptionv3": [InceptionV3, inception_v3, tf_input_blob],
|
||||
|
||||
"mobilenet": [MobileNet, mobilenet, tf_input_blob],
|
||||
"mobilenetv2": [MobileNetV2, mobilenet_v2, tf_input_blob],
|
||||
|
||||
"nasnetlarge": [NASNetLarge, nasnet, tf_input_blob],
|
||||
"nasnetmobile": [NASNetMobile, nasnet, tf_input_blob],
|
||||
|
||||
"xception": [Xception, xception, tf_input_blob]
|
||||
}
|
||||
|
||||
CNN_CLASS_ID = 0
|
||||
CNN_UTILS_ID = 1
|
||||
DEFAULT_BLOB_PARAMS_ID = 2
|
||||
|
||||
|
||||
class TFClsModel(TFModelPreparer):
|
||||
def __init__(self, model_name, original_model):
|
||||
super(TFClsModel, self).__init__(model_name, original_model)
|
||||
|
||||
|
||||
def main():
|
||||
set_tf_env()
|
||||
|
||||
parser = create_extended_parser(list(model_dict.keys()))
|
||||
cmd_args = parser.parse_args()
|
||||
|
||||
model_name = cmd_args.model_name
|
||||
model_name_val = model_dict[model_name]
|
||||
|
||||
cls_model = TFClsModel(
|
||||
model_name=model_name,
|
||||
original_model=model_name_val[CNN_CLASS_ID](
|
||||
include_top=True,
|
||||
weights="imagenet"
|
||||
)
|
||||
)
|
||||
|
||||
tf_cls_pipeline = ClsModelTestPipeline(
|
||||
network_model=cls_model,
|
||||
model_processor=TFModelProcessor,
|
||||
dnn_model_processor=TFDnnModelProcessor,
|
||||
data_fetcher=TFPreprocessedFetch,
|
||||
img_processor=model_name_val[CNN_UTILS_ID].preprocess_input,
|
||||
cls_args_parser=parser,
|
||||
default_input_blob_preproc=model_name_val[DEFAULT_BLOB_PARAMS_ID]
|
||||
)
|
||||
|
||||
tf_cls_pipeline.init_test_pipeline()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,142 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.applications import MobileNet
|
||||
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
|
||||
|
||||
from ...common.utils import set_tf_env
|
||||
|
||||
|
||||
def get_tf_model_proto(tf_model):
|
||||
# define the directory for .pb model
|
||||
pb_model_path = "models"
|
||||
|
||||
# define the name of .pb model
|
||||
pb_model_name = "mobilenet.pb"
|
||||
|
||||
# create directory for further converted model
|
||||
os.makedirs(pb_model_path, exist_ok=True)
|
||||
|
||||
# get model TF graph
|
||||
tf_model_graph = tf.function(lambda x: tf_model(x))
|
||||
|
||||
# get concrete function
|
||||
tf_model_graph = tf_model_graph.get_concrete_function(
|
||||
tf.TensorSpec(tf_model.inputs[0].shape, tf_model.inputs[0].dtype))
|
||||
|
||||
# obtain frozen concrete function
|
||||
frozen_tf_func = convert_variables_to_constants_v2(tf_model_graph)
|
||||
# get frozen graph
|
||||
frozen_tf_func.graph.as_graph_def()
|
||||
|
||||
# save full tf model
|
||||
tf.io.write_graph(graph_or_graph_def=frozen_tf_func.graph,
|
||||
logdir=pb_model_path,
|
||||
name=pb_model_name,
|
||||
as_text=False)
|
||||
|
||||
return os.path.join(pb_model_path, pb_model_name)
|
||||
|
||||
|
||||
def get_preprocessed_img(img_path):
|
||||
# read the image
|
||||
input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
|
||||
input_img = input_img.astype(np.float32)
|
||||
|
||||
# define preprocess parameters
|
||||
mean = np.array([1.0, 1.0, 1.0]) * 127.5
|
||||
scale = 1 / 127.5
|
||||
|
||||
# prepare input blob to fit the model input:
|
||||
# 1. subtract mean
|
||||
# 2. scale so that pixel values lie in the range [-1, 1]
|
||||
input_blob = cv2.dnn.blobFromImage(
|
||||
image=input_img,
|
||||
scalefactor=scale,
|
||||
size=(224, 224), # img target size
|
||||
mean=mean,
|
||||
swapRB=True, # BGR -> RGB
|
||||
crop=True # center crop
|
||||
)
|
||||
print("Input blob shape: {}\n".format(input_blob.shape))
|
||||
|
||||
return input_blob
|
||||
|
||||
|
||||
def get_imagenet_labels(labels_path):
|
||||
with open(labels_path) as f:
|
||||
imagenet_labels = [line.strip() for line in f.readlines()]
|
||||
return imagenet_labels
|
||||
|
||||
|
||||
def get_opencv_dnn_prediction(opencv_net, preproc_img, imagenet_labels):
|
||||
# set OpenCV DNN input
|
||||
opencv_net.setInput(preproc_img)
|
||||
|
||||
# OpenCV DNN inference
|
||||
out = opencv_net.forward()
|
||||
print("OpenCV DNN prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
print("* confidence: {:.4f}\n".format(confidence))
|
||||
|
||||
|
||||
def get_tf_dnn_prediction(original_net, preproc_img, imagenet_labels):
|
||||
# inference
|
||||
preproc_img = preproc_img.transpose(0, 2, 3, 1)
|
||||
print("TF input blob shape: {}\n".format(preproc_img.shape))
|
||||
|
||||
out = original_net(preproc_img)
|
||||
|
||||
print("\nTensorFlow model prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* confidence: {:.4f}".format(confidence))
|
||||
|
||||
|
||||
def main():
|
||||
# configure TF launching
|
||||
set_tf_env()
|
||||
|
||||
# initialize TF MobileNet model
|
||||
original_tf_model = MobileNet(
|
||||
include_top=True,
|
||||
weights="imagenet"
|
||||
)
|
||||
|
||||
# get TF frozen graph path
|
||||
full_pb_path = get_tf_model_proto(original_tf_model)
|
||||
|
||||
# read frozen graph with OpenCV API
|
||||
opencv_net = cv2.dnn.readNetFromTensorflow(full_pb_path)
|
||||
print("OpenCV model was successfully read. Model layers: \n", opencv_net.getLayerNames())
|
||||
|
||||
# get preprocessed image
|
||||
input_img = get_preprocessed_img("../data/squirrel_cls.jpg")
|
||||
|
||||
# get ImageNet labels
|
||||
imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt")
|
||||
|
||||
# obtain OpenCV DNN predictions
|
||||
get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels)
|
||||
|
||||
# obtain TF model predictions
|
||||
get_tf_dnn_prediction(original_tf_model, input_img, imagenet_labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,45 @@
|
||||
import os
|
||||
import tarfile
|
||||
import urllib.request
|
||||
|
||||
DETECTION_MODELS_URL = 'http://download.tensorflow.org/models/object_detection/'
|
||||
|
||||
|
||||
def extract_tf_frozen_graph(model_name, extracted_model_path):
|
||||
# define model archive name
|
||||
tf_model_tar = model_name + '.tar.gz'
|
||||
# define link to retrieve model archive
|
||||
model_link = DETECTION_MODELS_URL + tf_model_tar
|
||||
|
||||
tf_frozen_graph_name = 'frozen_inference_graph'
|
||||
|
||||
try:
|
||||
urllib.request.urlretrieve(model_link, tf_model_tar)
|
||||
except Exception:
|
||||
print("TF {} was not retrieved: {}".format(model_name, model_link))
|
||||
return
|
||||
|
||||
print("TF {} was retrieved.".format(model_name))
|
||||
|
||||
tf_model_tar = tarfile.open(tf_model_tar)
|
||||
frozen_graph_path = ""
|
||||
|
||||
for model_tar_elem in tf_model_tar.getmembers():
|
||||
if tf_frozen_graph_name in os.path.basename(model_tar_elem.name):
|
||||
tf_model_tar.extract(model_tar_elem, extracted_model_path)
|
||||
frozen_graph_path = os.path.join(extracted_model_path, model_tar_elem.name)
|
||||
break
|
||||
tf_model_tar.close()
|
||||
|
||||
return frozen_graph_path
|
||||
|
||||
|
||||
def main():
|
||||
tf_model_name = 'ssd_mobilenet_v1_coco_2017_11_17'
|
||||
graph_extraction_dir = "./"
|
||||
frozen_graph_path = extract_tf_frozen_graph(tf_model_name, graph_extraction_dir)
|
||||
print("Frozen graph path for {}: {}".format(tf_model_name, frozen_graph_path))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
112
samples/dnn/dnn_model_runner/dnn_conversion/tf/tf_model.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import cv2
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
|
||||
|
||||
from ..common.abstract_model import AbstractModel, Framework
|
||||
from ..common.utils import DNN_LIB, get_full_model_path
|
||||
|
||||
CURRENT_LIB = "TF"
|
||||
MODEL_FORMAT = ".pb"
|
||||
|
||||
|
||||
class TFModelPreparer(AbstractModel):
|
||||
""" Class for the preparation of the TF models: original and converted OpenCV Net.
|
||||
|
||||
Args:
|
||||
model_name: TF model name
|
||||
original_model: TF configured model object or session
|
||||
is_ready_graph: indicates whether ready .pb file already exists
|
||||
tf_model_graph_path: path to the existing frozen TF graph
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name="default",
|
||||
original_model=None,
|
||||
is_ready_graph=False,
|
||||
tf_model_graph_path=""
|
||||
):
|
||||
self._model_name = model_name
|
||||
self._original_model = original_model
|
||||
self._model_to_save = ""
|
||||
|
||||
self._is_ready_to_transfer_graph = is_ready_graph
|
||||
self.model_path = self._set_model_path(tf_model_graph_path)
|
||||
self._dnn_model = self._set_dnn_model()
|
||||
|
||||
def _set_dnn_model(self):
|
||||
if not self._is_ready_to_transfer_graph:
|
||||
# get model TF graph
|
||||
tf_model_graph = tf.function(lambda x: self._original_model(x))
|
||||
|
||||
tf_model_graph = tf_model_graph.get_concrete_function(
|
||||
tf.TensorSpec(self._original_model.inputs[0].shape, self._original_model.inputs[0].dtype))
|
||||
|
||||
# obtain frozen concrete function
|
||||
frozen_tf_func = convert_variables_to_constants_v2(tf_model_graph)
|
||||
frozen_tf_func.graph.as_graph_def()
|
||||
|
||||
# save full TF model
|
||||
tf.io.write_graph(graph_or_graph_def=frozen_tf_func.graph,
|
||||
logdir=self.model_path["path"],
|
||||
name=self._model_to_save,
|
||||
as_text=False)
|
||||
|
||||
return cv2.dnn.readNetFromTensorflow(self.model_path["full_path"])
|
||||
|
||||
def _set_model_path(self, tf_pb_file_path):
|
||||
""" Method for setting model paths.
|
||||
|
||||
Args:
|
||||
tf_pb_file_path: path to the existing TF .pb
|
||||
|
||||
Returns:
|
||||
a dictionary in which the "full_path" key holds the full path (directory and file name) of the saved model.
|
||||
"""
|
||||
model_paths_dict = {
|
||||
"path": "",
|
||||
"full_path": tf_pb_file_path
|
||||
}
|
||||
|
||||
if not self._is_ready_to_transfer_graph:
|
||||
self._model_to_save = self._model_name + MODEL_FORMAT
|
||||
model_paths_dict = get_full_model_path(CURRENT_LIB.lower(), self._model_to_save)
|
||||
|
||||
return model_paths_dict
|
||||
|
||||
def get_prepared_models(self):
|
||||
original_lib_name = CURRENT_LIB + " " + self._model_name
|
||||
configured_model_dict = {
|
||||
original_lib_name: self._original_model,
|
||||
DNN_LIB + " " + self._model_name: self._dnn_model
|
||||
}
|
||||
return configured_model_dict
|
||||
|
||||
|
||||
class TFModelProcessor(Framework):
|
||||
def __init__(self, prepared_model, model_name):
|
||||
self._prepared_model = prepared_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
assert len(input_blob.shape) == 4
|
||||
batch_tf = input_blob.transpose(0, 2, 3, 1)
|
||||
out = self._prepared_model(batch_tf)
|
||||
return out
|
||||
|
||||
def get_name(self):
|
||||
return CURRENT_LIB
|
||||
|
||||
|
||||
class TFDnnModelProcessor(Framework):
|
||||
def __init__(self, prepared_dnn_model, model_name):
|
||||
self._prepared_dnn_model = prepared_dnn_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
self._prepared_dnn_model.setInput(input_blob)
|
||||
ret_val = self._prepared_dnn_model.forward()
|
||||
return ret_val
|
||||
|
||||
def get_name(self):
|
||||
return DNN_LIB
|
||||
364
samples/dnn/download_models.py
Normal file
@@ -0,0 +1,364 @@
|
||||
'''
|
||||
Helper module to download extra data from Internet
|
||||
'''
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import cv2
|
||||
import sys
|
||||
import yaml
|
||||
import argparse
|
||||
import tarfile
|
||||
import platform
|
||||
import tempfile
|
||||
import hashlib
|
||||
import requests
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
if sys.version_info[0] < 3:
|
||||
from urllib2 import urlopen
|
||||
else:
|
||||
from urllib.request import urlopen
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
__all__ = ["downloadFile"]
|
||||
|
||||
class HashMismatchException(Exception):
|
||||
def __init__(self, expected, actual):
|
||||
Exception.__init__(self)
|
||||
self.expected = expected
|
||||
self.actual = actual
|
||||
def __str__(self):
|
||||
return 'Hash mismatch: expected {} vs actual {}'.format(self.expected, self.actual)
|
||||
|
||||
def getHashsumFromFile(filepath):
|
||||
sha = hashlib.sha1()
|
||||
if os.path.exists(filepath):
|
||||
print(' there is already a file with the same name')
|
||||
with open(filepath, 'rb') as f:
|
||||
while True:
|
||||
buf = f.read(10*1024*1024)
|
||||
if not buf:
|
||||
break
|
||||
sha.update(buf)
|
||||
hashsum = sha.hexdigest()
|
||||
return hashsum
|
||||
|
||||
def checkHashsum(expected_sha, filepath, silent=True):
|
||||
print(' expected SHA1: {}'.format(expected_sha))
|
||||
actual_sha = getHashsumFromFile(filepath)
|
||||
print('  actual SHA1: {}'.format(actual_sha))
|
||||
hashes_matched = expected_sha == actual_sha
|
||||
if not hashes_matched and not silent:
|
||||
raise HashMismatchException(expected_sha, actual_sha)
|
||||
return hashes_matched
|
||||
|
||||
def isArchive(filepath):
|
||||
return tarfile.is_tarfile(filepath)
|
||||
|
||||
class DownloadInstance:
|
||||
def __init__(self, **kwargs):
|
||||
self.name = kwargs.pop('name')
|
||||
self.filename = kwargs.pop('filename')
|
||||
self.loader = kwargs.pop('loader', None)
|
||||
self.save_dir = kwargs.pop('save_dir')
|
||||
self.sha = kwargs.pop('sha', None)
|
||||
|
||||
def __str__(self):
|
||||
return 'DownloadInstance <{}>'.format(self.name)
|
||||
|
||||
def get(self):
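# Reuse a previously downloaded file if its SHA1 matches; otherwise download it
# and store it in a SHA1-named subdirectory of save_dir.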
|
||||
print(" Working on " + self.name)
|
||||
print(" Getting file " + self.filename)
|
||||
if self.sha is None:
|
||||
print(' No expected hashsum provided, loading file')
|
||||
else:
|
||||
filepath = os.path.join(self.save_dir, self.sha, self.filename)
|
||||
if checkHashsum(self.sha, filepath):
|
||||
print(' hash match - file already exists, skipping')
|
||||
return filepath
|
||||
else:
|
||||
print(' hash didn\'t match, loading file')
|
||||
|
||||
if not os.path.exists(self.save_dir):
|
||||
print(' creating directory: ' + self.save_dir)
|
||||
os.makedirs(self.save_dir)
|
||||
|
||||
|
||||
print(' hash check failed - loading')
|
||||
assert self.loader
|
||||
try:
|
||||
self.loader.load(self.filename, self.sha, self.save_dir)
|
||||
print(' done')
|
||||
print(' file {}'.format(self.filename))
|
||||
if self.sha is None:
|
||||
download_path = os.path.join(self.save_dir, self.filename)
|
||||
self.sha = getHashsumFromFile(download_path)
|
||||
new_dir = os.path.join(self.save_dir, self.sha)
|
||||
|
||||
if not os.path.exists(new_dir):
|
||||
os.makedirs(new_dir)
|
||||
filepath = os.path.join(new_dir, self.filename)
|
||||
if not (os.path.exists(filepath)):
|
||||
shutil.move(download_path, new_dir)
|
||||
print(' No expected hashsum provided, actual SHA is {}'.format(self.sha))
|
||||
else:
|
||||
checkHashsum(self.sha, filepath, silent=False)
|
||||
except Exception as e:
|
||||
print(" There was some problem with loading file {} for {}".format(self.filename, self.name))
|
||||
print(" Exception: {}".format(e))
|
||||
return
|
||||
|
||||
print(" Finished " + self.name)
|
||||
return filepath
|
||||
|
||||
class Loader(object):
|
||||
MB = 1024*1024
|
||||
BUFSIZE = 10*MB
|
||||
def __init__(self, download_name, download_sha, archive_member = None):
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.archive_member = archive_member
|
||||
|
||||
def load(self, requested_file, sha, save_dir):
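# Download self.download_name (into a SHA1-named subfolder when a hash is known);
# if the downloaded file is an archive, extract the requested file from it.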
|
||||
if self.download_sha is None:
|
||||
download_dir = save_dir
|
||||
else:
|
||||
# create a new folder in save_dir to avoid possible name conflicts
|
||||
download_dir = os.path.join(save_dir, self.download_sha)
|
||||
if not os.path.exists(download_dir):
|
||||
os.makedirs(download_dir)
|
||||
download_path = os.path.join(download_dir, self.download_name)
|
||||
print(" Preparing to download file " + self.download_name)
|
||||
if checkHashsum(self.download_sha, download_path):
|
||||
print(' hash match - file already exists, no need to download')
|
||||
else:
|
||||
filesize = self.download(download_path)
|
||||
print(' Downloaded {} with size {} Mb'.format(self.download_name, filesize/self.MB))
|
||||
if self.download_sha is not None:
|
||||
checkHashsum(self.download_sha, download_path, silent=False)
|
||||
if self.download_name == requested_file:
|
||||
return
|
||||
else:
|
||||
if isArchive(download_path):
|
||||
if sha is not None:
|
||||
extract_dir = os.path.join(save_dir, sha)
|
||||
else:
|
||||
extract_dir = save_dir
|
||||
if not os.path.exists(extract_dir):
|
||||
os.makedirs(extract_dir)
|
||||
self.extract(requested_file, download_path, extract_dir)
|
||||
else:
|
||||
raise Exception("Downloaded file has different name")
|
||||
|
||||
def download(self, filepath):
|
||||
print("Warning: download is not implemented, this is a base class")
|
||||
return 0
|
||||
|
||||
def extract(self, requested_file, archive_path, save_dir):
|
||||
filepath = os.path.join(save_dir, requested_file)
|
||||
try:
|
||||
with tarfile.open(archive_path) as f:
|
||||
if self.archive_member is None:
|
||||
# map archive member basenames to their full member names inside the archive
pathDict = dict((os.path.split(elem)[1], elem) for elem in f.getnames())
|
||||
self.archive_member = pathDict[requested_file]
|
||||
assert self.archive_member in f.getnames()
|
||||
self.save(filepath, f.extractfile(self.archive_member))
|
||||
except Exception as e:
|
||||
print(' catch {}'.format(e))
|
||||
|
||||
def save(self, filepath, r):
|
||||
with open(filepath, 'wb') as f:
|
||||
print(' progress ', end="")
|
||||
sys.stdout.flush()
|
||||
while True:
|
||||
buf = r.read(self.BUFSIZE)
|
||||
if not buf:
|
||||
break
|
||||
f.write(buf)
|
||||
print('>', end="")
|
||||
sys.stdout.flush()
|
||||
|
||||
class URLLoader(Loader):
|
||||
def __init__(self, download_name, download_sha, url, archive_member = None):
|
||||
super(URLLoader, self).__init__(download_name, download_sha, archive_member)
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.url = url
|
||||
|
||||
def download(self, filepath):
|
||||
r = urlopen(self.url, timeout=60)
|
||||
self.printRequest(r)
|
||||
self.save(filepath, r)
|
||||
return os.path.getsize(filepath)
|
||||
|
||||
def printRequest(self, r):
|
||||
def getMB(r):
|
||||
d = dict(r.info())
|
||||
for c in ['content-length', 'Content-Length']:
|
||||
if c in d:
|
||||
return int(d[c]) / self.MB
|
||||
return '<unknown>'
|
||||
print(' {} {} [{} Mb]'.format(r.getcode(), r.msg, getMB(r)))
|
||||
|
||||
class GDriveLoader(Loader):
|
||||
BUFSIZE = 1024 * 1024
|
||||
PROGRESS_SIZE = 10 * 1024 * 1024
|
||||
def __init__(self, download_name, download_sha, gid, archive_member = None):
|
||||
super(GDriveLoader, self).__init__(download_name, download_sha, archive_member)
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.gid = gid
|
||||
|
||||
def download(self, filepath):
|
||||
session = requests.Session() # re-use cookies
|
||||
|
||||
URL = "https://docs.google.com/uc?export=download"
|
||||
response = session.get(URL, params = { 'id' : self.gid }, stream = True)
|
||||
|
||||
def get_confirm_token(response): # in case of large files
|
||||
for key, value in response.cookies.items():
|
||||
if key.startswith('download_warning'):
|
||||
return value
|
||||
return None
|
||||
token = get_confirm_token(response)
|
||||
|
||||
if token:
|
||||
params = { 'id' : self.gid, 'confirm' : token }
|
||||
response = session.get(URL, params = params, stream = True)
|
||||
|
||||
sz = 0
|
||||
progress_sz = self.PROGRESS_SIZE
|
||||
with open(filepath, "wb") as f:
|
||||
for chunk in response.iter_content(self.BUFSIZE):
|
||||
if not chunk:
|
||||
continue # keep-alive
|
||||
|
||||
f.write(chunk)
|
||||
sz += len(chunk)
|
||||
if sz >= progress_sz:
|
||||
progress_sz += self.PROGRESS_SIZE
|
||||
print('>', end='')
|
||||
sys.stdout.flush()
|
||||
print('')
|
||||
return sz
|
||||
|
||||
def produceDownloadInstance(instance_name, filename, sha, url, save_dir, download_name=None, download_sha=None, archive_member=None):
|
||||
spec_param = url
|
||||
loader = URLLoader
|
||||
if download_name is None:
|
||||
download_name = filename
|
||||
if download_sha is None:
|
||||
download_sha = sha
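# Google Drive links need a dedicated loader: extract the file id from the URL
# and download through GDriveLoader instead of a plain URL request.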
|
||||
if "drive.google.com" in url:
|
||||
token = ""
|
||||
token_part = url.rsplit('/', 1)[-1]
|
||||
if "&id=" not in token_part:
|
||||
token_part = url.rsplit('/', 1)[-2]
|
||||
for param in token_part.split("&"):
|
||||
if param.startswith("id="):
|
||||
token = param[3:]
|
||||
if token:
|
||||
loader = GDriveLoader
|
||||
spec_param = token
|
||||
else:
|
||||
print("Warning: possibly wrong Google Drive link")
|
||||
return DownloadInstance(
|
||||
name=instance_name,
|
||||
filename=filename,
|
||||
sha=sha,
|
||||
save_dir=save_dir,
|
||||
loader=loader(download_name, download_sha, spec_param, archive_member)
|
||||
)
|
||||
|
||||
def getSaveDir():
|
||||
env_path = os.environ.get("OPENCV_DOWNLOAD_DATA_PATH", None)
|
||||
if env_path:
|
||||
save_dir = env_path
|
||||
else:
|
||||
# TODO reuse binding function cv2.utils.fs.getCacheDirectory when issue #19011 is fixed
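# Fall back to a per-OS cache location: TMPDIR on macOS, the system temp directory
# on Windows, XDG_CACHE_HOME or ~/.cache on other systems.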
|
||||
if platform.system() == "Darwin":
|
||||
# On Apple devices
|
||||
temp_env = os.environ.get("TMPDIR", None)
|
||||
if temp_env is None or not os.path.isdir(temp_env):
|
||||
temp_dir = Path("/tmp")
|
||||
print("Using world accessible cache directory. This may be not secure: ", temp_dir)
|
||||
else:
|
||||
temp_dir = temp_env
|
||||
elif platform.system() == "Windows":
|
||||
temp_dir = tempfile.gettempdir()
|
||||
else:
|
||||
xdg_cache_env = os.environ.get("XDG_CACHE_HOME", None)
|
||||
if (xdg_cache_env and xdg_cache_env[0] and os.path.isdir(xdg_cache_env)):
|
||||
temp_dir = xdg_cache_env
|
||||
else:
|
||||
home_env = os.environ.get("HOME", None)
|
||||
if (home_env and home_env[0] and os.path.isdir(home_env)):
|
||||
home_path = os.path.join(home_env, ".cache/")
|
||||
if os.path.isdir(home_path):
|
||||
temp_dir = home_path
|
||||
else:
|
||||
temp_dir = tempfile.gettempdir()
|
||||
print("Using world accessible cache directory. This may be not secure: ", temp_dir)
|
||||
|
||||
save_dir = os.path.join(temp_dir, "downloads")
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
return save_dir
|
||||
|
||||
def downloadFile(url, sha=None, save_dir=None, filename=None):
|
||||
if save_dir is None:
|
||||
save_dir = getSaveDir()
|
||||
if filename is None:
|
||||
filename = "download_" + datetime.now().__str__()
|
||||
name = filename
|
||||
return produceDownloadInstance(name, filename, sha, url, save_dir).get()
|
||||
|
||||
def parseMetalinkFile(metalink_filepath, save_dir):
|
||||
NS = {'ml': 'urn:ietf:params:xml:ns:metalink'}
|
||||
models = []
|
||||
for file_elem in ET.parse(metalink_filepath).getroot().findall('ml:file', NS):
|
||||
url = file_elem.find('ml:url', NS).text
|
||||
fname = file_elem.attrib['name']
|
||||
name = file_elem.find('ml:identity', NS).text
|
||||
hash_sum = file_elem.find('ml:hash', NS).text
|
||||
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir))
|
||||
return models
|
||||
|
||||
def parseYAMLFile(yaml_filepath, save_dir):
|
||||
models = []
|
||||
with open(yaml_filepath, 'r') as stream:
|
||||
data_loaded = yaml.safe_load(stream)
|
||||
for name, params in data_loaded.items():
|
||||
load_info = params.get("load_info", None)
|
||||
if load_info:
|
||||
fname = os.path.basename(params.get("model"))
|
||||
hash_sum = load_info.get("sha1")
|
||||
url = load_info.get("url")
|
||||
download_sha = load_info.get("download_sha")
|
||||
download_name = load_info.get("download_name")
|
||||
archive_member = load_info.get("member")
|
||||
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
|
||||
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
|
||||
|
||||
return models
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='This is a utility script for downloading DNN models for samples.')
|
||||
|
||||
parser.add_argument('--save_dir', action="store", default=os.getcwd(),
|
||||
help='Path to the directory to store downloaded files')
|
||||
parser.add_argument('model_name', type=str, default="", nargs='?', action="store",
|
||||
help='name of the model to download')
|
||||
args = parser.parse_args()
|
||||
models = []
|
||||
save_dir = args.save_dir
|
||||
selected_model_name = args.model_name
|
||||
models.extend(parseMetalinkFile('face_detector/weights.meta4', save_dir))
|
||||
models.extend(parseYAMLFile('models.yml', save_dir))
|
||||
for m in models:
|
||||
print(m)
|
||||
if selected_model_name and not m.name.startswith(selected_model_name):
|
||||
continue
|
||||
print('Model: ' + selected_model_name)
|
||||
m.get()
|
||||
69
samples/dnn/edge_detection.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This sample shows how to define custom OpenCV deep learning layers in Python. '
|
||||
'Holistically-Nested Edge Detection (https://arxiv.org/abs/1504.06375) neural network '
|
||||
'is used as an example model. Find a pre-trained model at https://github.com/s9xie/hed.')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--prototxt', help='Path to deploy.prototxt', required=True)
|
||||
parser.add_argument('--caffemodel', help='Path to hed_pretrained_bsds.caffemodel', required=True)
|
||||
parser.add_argument('--width', help='Resize input image to a specific width', default=500, type=int)
|
||||
parser.add_argument('--height', help='Resize input image to a specific height', default=500, type=int)
|
||||
args = parser.parse_args()
|
||||
|
||||
#! [CropLayer]
|
||||
class CropLayer(object):
|
||||
def __init__(self, params, blobs):
|
||||
self.xstart = 0
|
||||
self.xend = 0
|
||||
self.ystart = 0
|
||||
self.yend = 0
|
||||
|
||||
# Our layer receives two inputs. We need to crop the first input blob
|
||||
# to match the shape of the second one (keeping batch size and number of channels)
|
||||
def getMemoryShapes(self, inputs):
|
||||
inputShape, targetShape = inputs[0], inputs[1]
|
||||
batchSize, numChannels = inputShape[0], inputShape[1]
|
||||
height, width = targetShape[2], targetShape[3]
|
||||
|
||||
self.ystart = (inputShape[2] - targetShape[2]) // 2
|
||||
self.xstart = (inputShape[3] - targetShape[3]) // 2
|
||||
self.yend = self.ystart + height
|
||||
self.xend = self.xstart + width
|
||||
|
||||
return [[batchSize, numChannels, height, width]]
|
||||
|
||||
def forward(self, inputs):
|
||||
return [inputs[0][:,:,self.ystart:self.yend,self.xstart:self.xend]]
|
||||
#! [CropLayer]
|
||||
|
||||
#! [Register]
|
||||
cv.dnn_registerLayer('Crop', CropLayer)
|
||||
#! [Register]
|
||||
|
||||
# Load the model.
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.prototxt), cv.samples.findFile(args.caffemodel))
|
||||
|
||||
kWinName = 'Holistically-Nested Edge Detection'
|
||||
cv.namedWindow('Input', cv.WINDOW_NORMAL)
|
||||
cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
cv.imshow('Input', frame)
|
||||
|
||||
inp = cv.dnn.blobFromImage(frame, scalefactor=1.0, size=(args.width, args.height),
|
||||
mean=(104.00698793, 116.66876762, 122.67891434),
|
||||
swapRB=False, crop=False)
|
||||
net.setInput(inp)
|
||||
|
||||
out = net.forward()
|
||||
out = out[0, 0]
|
||||
out = cv.resize(out, (frame.shape[1], frame.shape[0]))
|
||||
cv.imshow(kWinName, out)
|
||||
1790
samples/dnn/face_detector/deploy.prototxt
Normal file
File diff suppressed because it is too large
1790
samples/dnn/face_detector/deploy_lowres.prototxt
Normal file
File diff suppressed because it is too large
79
samples/dnn/face_detector/how_to_train_face_detector.txt
Normal file
@@ -0,0 +1,79 @@
|
||||
This is a brief description of the training process that was used to produce res10_300x300_ssd_iter_140000.caffemodel.
|
||||
The model was created with the SSD framework using a ResNet-10-like architecture as a backbone. The channel count in the ResNet-10 convolution layers was significantly reduced (2x to 4x fewer channels).
|
||||
The model was trained with the Caffe framework on a large, publicly available dataset.
|
||||
|
||||
1. Prepare training tools
|
||||
You need to use the "ssd" branch from this repository: https://github.com/weiliu89/caffe/tree/ssd . Check out this branch and build it (see the instructions in the repo's README).
|
||||
|
||||
2. Prepare training data.
|
||||
The data preparation pipeline can be represented as:
|
||||
|
||||
(a) Download the original face detection dataset -> (b) Convert the annotation to the PASCAL VOC format -> (c) Create an LMDB database with images + annotations for training
|
||||
|
||||
a) Find some datasets with face bounding box annotations. For some reasons I can't provide links here, but you can easily find them on your own. Also study the data: it may contain small or low-quality faces that can spoil the training process. Often the annotation contains special flags about object quality. Remove such faces from the annotation (smaller than 16 pixels along at least one side, blurred, highly occluded, and so on).
|
||||
|
||||
b) The downloaded dataset will have annotations in some format: it may be a single file for all images, a separate file for each image, or something else. To train SSD in Caffe you need to convert the annotation to the PASCAL VOC format.
|
||||
A PASCAL VOC annotation consists of an .xml file for each image. In this xml file all face bounding boxes should be listed as:
|
||||
|
||||
<annotation>
|
||||
<size>
|
||||
<width>300</width>
|
||||
<height>300</height>
|
||||
</size>
|
||||
<object>
|
||||
<name>face</name>
|
||||
<difficult>0</difficult>
|
||||
<bndbox>
|
||||
<xmin>100</xmin>
|
||||
<ymin>100</ymin>
|
||||
<xmax>200</xmax>
|
||||
<ymax>200</ymax>
|
||||
</bndbox>
|
||||
</object>
|
||||
<object>
|
||||
<name>face</name>
|
||||
<difficult>0</difficult>
|
||||
<bndbox>
|
||||
<xmin>0</xmin>
|
||||
<ymin>0</ymin>
|
||||
<xmax>100</xmax>
|
||||
<ymax>100</ymax>
|
||||
</bndbox>
|
||||
</object>
|
||||
</annotation>
|
||||
|
||||
So, convert your dataset's annotation to the format above.
|
||||
Also, you should create a labelmap.prototxt file with the following content:
|
||||
item {
|
||||
name: "none_of_the_above"
|
||||
label: 0
|
||||
display_name: "background"
|
||||
}
|
||||
item {
|
||||
name: "face"
|
||||
label: 1
|
||||
display_name: "face"
|
||||
}
|
||||
|
||||
You need this file to establish the correspondence between the class name and its numeric label.
|
||||
|
||||
For the next step we also need a file in which all our image/annotation file-name pairs are listed. This file should contain lines similar to:
|
||||
images_val/0.jpg annotations_val/0.jpg.xml
|
||||
|
||||
c) To create the LMDB databases you need to use the create_data.sh tool from the caffe/data/VOC0712 directory of Caffe's source code.
|
||||
This script calls create_annoset.py internally, so check which arguments you need to pass to the script (a rough sketch of a typical setup is shown below).
|
||||
|
||||
You need to prepare 2 LMDB databases: one for training images, one for validation images.
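As a rough sketch of how this is usually set up (the variable names follow the VOC0712 example script; all paths and names below are placeholders for your own layout), copy the example script, edit its variables and run it from the Caffe source root:
cd /path_for_caffe_source_code
cp -r data/VOC0712 data/faces
Then edit data/faces/create_data.sh: point data_root_dir and dataset_name to your data, set mapfile to your labelmap.prototxt, keep anno_type="detection" and db="lmdb", and make the script read the train/val list files prepared above. Finally run:
./data/faces/create_data.sh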
|
||||
|
||||
3. Train your detector
|
||||
For training you need 3 files: train.prototxt, test.prototxt and solver.prototxt. You can find these files in the same directory as this readme.
|
||||
You also need to edit train.prototxt and test.prototxt so that the LMDB database paths point to the databases you created in step 2.
|
||||
|
||||
Now everything is ready to launch the training process.
|
||||
Execute the following lines in a terminal:
|
||||
mkdir -p snapshot
|
||||
mkdir -p log
|
||||
/path_for_caffe_build_dir/tools/caffe train -solver="solver.prototxt" -gpu 0 2>&1 | tee -a log/log.log
|
||||
|
||||
Then wait. The process takes about 8 hours to finish.
|
||||
After that you can use your .caffemodel from the snapshot/ subdirectory in the resnet_face_ssd_python.py sample.
|
||||
2368
samples/dnn/face_detector/opencv_face_detector.pbtxt
Normal file
File diff suppressed because it is too large
28
samples/dnn/face_detector/solver.prototxt
Normal file
@@ -0,0 +1,28 @@
|
||||
train_net: "train.prototxt"
|
||||
test_net: "test.prototxt"
|
||||
|
||||
test_iter: 2312
|
||||
test_interval: 5000
|
||||
test_initialization: true
|
||||
|
||||
base_lr: 0.01
|
||||
display: 10
|
||||
lr_policy: "multistep"
|
||||
max_iter: 140000
|
||||
stepvalue: 80000
|
||||
stepvalue: 120000
|
||||
gamma: 0.1
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0005
|
||||
average_loss: 500
|
||||
iter_size: 1
|
||||
type: "SGD"
|
||||
|
||||
solver_mode: GPU
|
||||
random_seed: 0
|
||||
debug_info: false
|
||||
snapshot: 1000
|
||||
snapshot_prefix: "snapshot/res10_300x300_ssd"
|
||||
|
||||
eval_type: "detection"
|
||||
ap_version: "11point"
|
||||
1831
samples/dnn/face_detector/test.prototxt
Normal file
File diff suppressed because it is too large
1898
samples/dnn/face_detector/train.prototxt
Normal file
File diff suppressed because it is too large
13
samples/dnn/face_detector/weights.meta4
Normal file
@@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<metalink xmlns="urn:ietf:params:xml:ns:metalink">
|
||||
<file name="res10_300x300_ssd_iter_140000_fp16.caffemodel">
|
||||
<identity>opencv_face_detector_fp16</identity>
|
||||
<hash type="sha-1">31fc22bfdd907567a04bb45b7cfad29966caddc1</hash>
|
||||
<url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel</url>
|
||||
</file>
|
||||
<file name="opencv_face_detector_uint8.pb">
|
||||
<identity>opencv_face_detector_uint8</identity>
|
||||
<hash type="sha-1">4f2fdf6f231d759d7bbdb94353c5a68690f3d2ae</hash>
|
||||
<url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180220_uint8/opencv_face_detector_uint8.pb</url>
|
||||
</file>
|
||||
</metalink>
|
||||
53
samples/dnn/fast_neural_style.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import print_function
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This script is used to run style transfer models from '
|
||||
'https://github.com/jcjohnson/fast-neural-style using OpenCV')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--model', help='Path to .t7 model')
|
||||
parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
|
||||
parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
|
||||
parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
|
||||
args = parser.parse_args()
|
||||
|
||||
net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
if args.input:
|
||||
cap = cv.VideoCapture(args.input)
|
||||
else:
|
||||
cap = cv.VideoCapture(0)
|
||||
|
||||
cv.namedWindow('Styled image', cv.WINDOW_NORMAL)
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
inWidth = args.width if args.width != -1 else frame.shape[1]
|
||||
inHeight = args.height if args.height != -1 else frame.shape[0]
|
||||
inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
|
||||
(103.939, 116.779, 123.68), swapRB=False, crop=False)
|
||||
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
out = out.reshape(3, out.shape[2], out.shape[3])
|
||||
out[0] += 103.939
|
||||
out[1] += 116.779
|
||||
out[2] += 123.68
|
||||
out /= 255
|
||||
out = out.transpose(1, 2, 0)
|
||||
|
||||
t, _ = net.getPerfProfile()
|
||||
freq = cv.getTickFrequency() / 1000
|
||||
print(t / freq, 'ms')
|
||||
|
||||
if args.median_filter:
|
||||
out = cv.medianBlur(out, args.median_filter)
|
||||
|
||||
cv.imshow('Styled image', out)
|
||||
104
samples/dnn/human_parsing.cpp
Normal file
@@ -0,0 +1,104 @@
|
||||
//
|
||||
// this sample demonstrates parsing (segmenting) human body parts from an image using opencv's dnn,
|
||||
// based on https://github.com/Engineering-Course/LIP_JPPNet
|
||||
//
|
||||
// get the pretrained model from: https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
//
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
using namespace cv;
|
||||
|
||||
|
||||
static Mat parse_human(const Mat &image, const std::string &model, int backend=dnn::DNN_BACKEND_DEFAULT, int target=dnn::DNN_TARGET_CPU) {
|
||||
// this network expects an image and a flipped copy as input
|
||||
Mat flipped;
|
||||
flip(image, flipped, 1);
|
||||
std::vector<Mat> batch;
|
||||
batch.push_back(image);
|
||||
batch.push_back(flipped);
|
||||
Mat blob = dnn::blobFromImages(batch, 1.0, Size(), Scalar(104.00698793, 116.66876762, 122.67891434));
|
||||
|
||||
dnn::Net net = dnn::readNet(model);
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(target);
|
||||
net.setInput(blob);
|
||||
Mat out = net.forward();
|
||||
// expected output: [2, 20, 384, 384], (2 lists(orig, flipped) of 20 body part heatmaps 384x384)
|
||||
|
||||
// LIP classes:
|
||||
// 0 Background, 1 Hat, 2 Hair, 3 Glove, 4 Sunglasses, 5 UpperClothes, 6 Dress, 7 Coat, 8 Socks, 9 Pants
|
||||
// 10 Jumpsuits, 11 Scarf, 12 Skirt, 13 Face, 14 LeftArm, 15 RightArm, 16 LeftLeg, 17 RightLeg, 18 LeftShoe, 19 RightShoe
|
||||
Vec3b colors[] = {
|
||||
Vec3b(0, 0, 0), Vec3b(128, 0, 0), Vec3b(255, 0, 0), Vec3b(0, 85, 0), Vec3b(170, 0, 51), Vec3b(255, 85, 0),
|
||||
Vec3b(0, 0, 85), Vec3b(0, 119, 221), Vec3b(85, 85, 0), Vec3b(0, 85, 85), Vec3b(85, 51, 0), Vec3b(52, 86, 128),
|
||||
Vec3b(0, 128, 0), Vec3b(0, 0, 255), Vec3b(51, 170, 221), Vec3b(0, 255, 255), Vec3b(85, 255, 170),
|
||||
Vec3b(170, 255, 85), Vec3b(255, 255, 0), Vec3b(255, 170, 0)
|
||||
};
|
||||
|
||||
Mat segm(image.size(), CV_8UC3, Scalar(0,0,0));
|
||||
Mat maxval(image.size(), CV_32F, Scalar(0));
|
||||
|
||||
// iterate over body part heatmaps (LIP classes)
|
||||
for (int i=0; i<out.size[1]; i++) {
|
||||
// resize heatmaps to original image size
|
||||
// "head" is the original image result, "tail" the flipped copy
|
||||
Mat head, h(out.size[2], out.size[3], CV_32F, out.ptr<float>(0,i));
|
||||
resize(h, head, image.size());
|
||||
|
||||
// we have to swap the last 3 pairs in the "tail" list
|
||||
static int tail_order[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,14,17,16,19,18};
|
||||
Mat tail, t(out.size[2], out.size[3], CV_32F, out.ptr<float>(1,tail_order[i]));
|
||||
resize(t, tail, image.size());
|
||||
flip(tail, tail, 1);
|
||||
|
||||
// mix original and flipped result
|
||||
Mat avg = (head + tail) * 0.5;
|
||||
|
||||
// write color if prob value > maxval
|
||||
Mat cmask;
|
||||
compare(avg, maxval, cmask, CMP_GT);
|
||||
segm.setTo(colors[i], cmask);
|
||||
|
||||
// keep largest values for next iteration
|
||||
max(avg, maxval, maxval);
|
||||
}
|
||||
cvtColor(segm, segm, COLOR_RGB2BGR);
|
||||
return segm;
|
||||
}
|
||||
|
||||
int main(int argc, char**argv)
|
||||
{
|
||||
CommandLineParser parser(argc,argv,
|
||||
"{help h | | show help screen / args}"
|
||||
"{image i | | person image to process }"
|
||||
"{model m |lip_jppnet_384.pb| network model}"
|
||||
"{backend b | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation }"
|
||||
"{target t | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU }"
|
||||
);
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
std::string model = parser.get<std::string>("model");
|
||||
std::string image = parser.get<std::string>("image");
|
||||
int backend = parser.get<int>("backend");
|
||||
int target = parser.get<int>("target");
|
||||
|
||||
Mat input = imread(image);
|
||||
Mat segm = parse_human(input, model, backend, target);
|
||||
|
||||
imshow("human parsing", segm);
|
||||
waitKey();
|
||||
return 0;
|
||||
}
|
||||
183
samples/dnn/human_parsing.py
Normal file
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
You can download the converted pb model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
or convert the model yourself.
|
||||
|
||||
Follow these steps if you want to convert the original model yourself:
|
||||
To get the original .meta pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
|
||||
To correctly convert the .meta model to .pb, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
|
||||
Modify the script evaluate_parsing_JPPNet-s2.py for human parsing as follows:
|
||||
1. Remove preprocessing to create image_batch_origin:
|
||||
with tf.name_scope("create_inputs"):
|
||||
...
|
||||
Add
|
||||
image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
|
||||
|
||||
2. Create input
|
||||
image = cv2.imread(path/to/image)
|
||||
image_rev = np.flip(image, axis=1)
|
||||
input = np.stack([image, image_rev], axis=0)
|
||||
|
||||
3. Hardcode image_h and image_w shapes to determine output shapes.
|
||||
We use default INPUT_SIZE = (384, 384) from evaluate_parsing_JPPNet-s2.py.
|
||||
parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, INPUT_SIZE),
|
||||
tf.image.resize_images(parsing_out1_075, INPUT_SIZE),
|
||||
tf.image.resize_images(parsing_out1_125, INPUT_SIZE)]), axis=0)
|
||||
Do the same with parsing_out2 and parsing_out3.
|
||||
4. Remove postprocessing. Last net operation:
|
||||
raw_output = tf.reduce_mean(tf.stack([parsing_out1, parsing_out2, parsing_out3]), axis=0)
|
||||
Change:
|
||||
parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
|
||||
|
||||
5. To save the model, after sess.run(...) add:
|
||||
input_graph_def = tf.get_default_graph().as_graph_def()
|
||||
output_node = "Mean_3"
|
||||
output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
|
||||
|
||||
output_graph = "LIP_JPPNet.pb"
|
||||
with tf.gfile.GFile(output_graph, "wb") as f:
|
||||
f.write(output_graph_def.SerializeToString())
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
|
||||
|
||||
|
||||
def preprocess(image):
|
||||
"""
|
||||
Create 4-dimensional blob from image and flip image
|
||||
:param image: input image
|
||||
"""
|
||||
image_rev = np.flip(image, axis=1)
|
||||
input = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
|
||||
return input
|
||||
|
||||
|
||||
def run_net(input, model_path, backend, target):
|
||||
"""
|
||||
Read network and infer model
|
||||
:param model_path: path to JPPNet model
|
||||
:param backend: computation backend
|
||||
:param target: computation device
|
||||
"""
|
||||
net = cv.dnn.readNet(model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(input)
|
||||
out = net.forward()
|
||||
return out
|
||||
|
||||
|
||||
def postprocess(out, input_shape):
|
||||
"""
|
||||
Create a grayscale human segmentation
|
||||
:param out: network output
|
||||
:param input_shape: input image width and height
|
||||
"""
|
||||
# LIP classes
|
||||
# 0 Background
|
||||
# 1 Hat
|
||||
# 2 Hair
|
||||
# 3 Glove
|
||||
# 4 Sunglasses
|
||||
# 5 UpperClothes
|
||||
# 6 Dress
|
||||
# 7 Coat
|
||||
# 8 Socks
|
||||
# 9 Pants
|
||||
# 10 Jumpsuits
|
||||
# 11 Scarf
|
||||
# 12 Skirt
|
||||
# 13 Face
|
||||
# 14 LeftArm
|
||||
# 15 RightArm
|
||||
# 16 LeftLeg
|
||||
# 17 RightLeg
|
||||
# 18 LeftShoe
|
||||
# 19 RightShoe
|
||||
head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
|
||||
head_output = head_output.squeeze(0)
|
||||
tail_output = tail_output.squeeze(0)
|
||||
|
||||
head_output = np.stack([cv.resize(img, dsize=input_shape) for img in head_output[:, ...]])
|
||||
tail_output = np.stack([cv.resize(img, dsize=input_shape) for img in tail_output[:, ...]])
|
||||
|
||||
tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
|
||||
tail_list = [arr.squeeze(0) for arr in tail_list]
|
||||
tail_list_rev = [tail_list[i] for i in range(14)]
|
||||
tail_list_rev.extend([tail_list[15], tail_list[14], tail_list[17], tail_list[16], tail_list[19], tail_list[18]])
|
||||
tail_output_rev = np.stack(tail_list_rev, axis=0)
|
||||
tail_output_rev = np.flip(tail_output_rev, axis=2)
|
||||
raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=True)
|
||||
raw_output_all = np.argmax(raw_output_all, axis=1)
|
||||
raw_output_all = raw_output_all.transpose(1, 2, 0)
|
||||
return raw_output_all
|
||||
|
||||
|
||||
def decode_labels(gray_image):
|
||||
"""
|
||||
Colorize image according to labels
|
||||
:param gray_image: grayscale human segmentation result
|
||||
"""
|
||||
height, width, _ = gray_image.shape
|
||||
colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
|
||||
(0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
|
||||
(0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255),(85, 255, 170),
|
||||
(170, 255, 85), (255, 255, 0), (255, 170, 0)]
|
||||
|
||||
segm = np.stack([colors[idx] for idx in gray_image.flatten()])
|
||||
segm = segm.reshape(height, width, 3).astype(np.uint8)
|
||||
segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
|
||||
return segm
|
||||
|
||||
|
||||
def parse_human(image, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Prepare input for execution, run net and postprocess output to parse human.
|
||||
:param image: input image
|
||||
:param model_path: path to JPPNet model
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
input = preprocess(image)
|
||||
input_h, input_w = input.shape[2:]
|
||||
output = run_net(input, model_path, backend, target)
|
||||
grayscale_out = postprocess(output, (input_w, input_h))
|
||||
segmentation = decode_labels(grayscale_out)
|
||||
return segmentation
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input', '-i', required=True, help='Path to input image.')
|
||||
parser.add_argument('--model', '-m', default='lip_jppnet_384.pb', help='Path to pb model.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Model not exist")
|
||||
|
||||
image = cv.imread(args.input)
|
||||
output = parse_human(image, args.model, args.backend, args.target)
|
||||
winName = 'Deep learning human parsing in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
|
||||
cv.imshow(winName, output)
|
||||
cv.waitKey()
|
||||
205
samples/dnn/js_face_recognition.html
Normal file
@@ -0,0 +1,205 @@
|
||||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<script async src="../../opencv.js" type="text/javascript"></script>
|
||||
<script src="../../utils.js" type="text/javascript"></script>
|
||||
|
||||
<script type='text/javascript'>
|
||||
var netDet = undefined, netRecogn = undefined;
|
||||
var persons = {};
|
||||
|
||||
//! [Run face detection model]
|
||||
function detectFaces(img) {
|
||||
var blob = cv.blobFromImage(img, 1, {width: 192, height: 144}, [104, 117, 123, 0], false, false);
|
||||
netDet.setInput(blob);
|
||||
var out = netDet.forward();
|
||||
|
||||
var faces = [];
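// Each detection occupies 7 values: [batchId, classId, confidence, left, top, right, bottom],
// where the box coordinates are relative to the input size.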
|
||||
for (var i = 0, n = out.data32F.length; i < n; i += 7) {
|
||||
var confidence = out.data32F[i + 2];
|
||||
var left = out.data32F[i + 3] * img.cols;
|
||||
var top = out.data32F[i + 4] * img.rows;
|
||||
var right = out.data32F[i + 5] * img.cols;
|
||||
var bottom = out.data32F[i + 6] * img.rows;
|
||||
left = Math.min(Math.max(0, left), img.cols - 1);
|
||||
right = Math.min(Math.max(0, right), img.cols - 1);
|
||||
bottom = Math.min(Math.max(0, bottom), img.rows - 1);
|
||||
top = Math.min(Math.max(0, top), img.rows - 1);
|
||||
|
||||
if (confidence > 0.5 && left < right && top < bottom) {
|
||||
faces.push({x: left, y: top, width: right - left, height: bottom - top})
|
||||
}
|
||||
}
|
||||
blob.delete();
|
||||
out.delete();
|
||||
return faces;
|
||||
};
|
||||
//! [Run face detection model]
|
||||
|
||||
//! [Get 128 floating points feature vector]
|
||||
function face2vec(face) {
|
||||
var blob = cv.blobFromImage(face, 1.0 / 255, {width: 96, height: 96}, [0, 0, 0, 0], true, false)
|
||||
netRecogn.setInput(blob);
|
||||
var vec = netRecogn.forward();
|
||||
blob.delete();
|
||||
return vec;
|
||||
};
|
||||
//! [Get 128 floating points feature vector]
|
||||
|
||||
//! [Recognize]
|
||||
function recognize(face) {
|
||||
var vec = face2vec(face);
|
||||
|
||||
var bestMatchName = 'unknown';
|
||||
var bestMatchScore = 0.5; // Actually, the minimum is -1 but we use it as a threshold.
|
||||
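// The embeddings are presumably L2-normalized, so the dot product below acts as a
// cosine similarity in [-1, 1]; 0.5 is just an empirical acceptance threshold.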
for (name in persons) {
|
||||
var personVec = persons[name];
|
||||
var score = vec.dot(personVec);
|
||||
if (score > bestMatchScore) {
|
||||
bestMatchScore = score;
|
||||
bestMatchName = name;
|
||||
}
|
||||
}
|
||||
vec.delete();
|
||||
return bestMatchName;
|
||||
};
|
||||
//! [Recognize]
|
||||
|
||||
function loadModels(callback) {
|
||||
var utils = new Utils('');
|
||||
var proto = 'https://raw.githubusercontent.com/opencv/opencv/master/samples/dnn/face_detector/deploy_lowres.prototxt';
|
||||
var weights = 'https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel';
|
||||
var recognModel = 'https://raw.githubusercontent.com/pyannote/pyannote-data/master/openface.nn4.small2.v1.t7';
|
||||
utils.createFileFromUrl('face_detector.prototxt', proto, () => {
|
||||
document.getElementById('status').innerHTML = 'Downloading face_detector.caffemodel';
|
||||
utils.createFileFromUrl('face_detector.caffemodel', weights, () => {
|
||||
document.getElementById('status').innerHTML = 'Downloading OpenFace model';
|
||||
utils.createFileFromUrl('face_recognition.t7', recognModel, () => {
|
||||
document.getElementById('status').innerHTML = '';
|
||||
netDet = cv.readNetFromCaffe('face_detector.prototxt', 'face_detector.caffemodel');
|
||||
netRecogn = cv.readNetFromTorch('face_recognition.t7');
|
||||
callback();
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
function main() {
|
||||
// Create a camera object.
|
||||
var output = document.getElementById('output');
|
||||
var camera = document.createElement("video");
|
||||
camera.setAttribute("width", output.width);
|
||||
camera.setAttribute("height", output.height);
|
||||
|
||||
// Ask the user for permission to use the camera.
|
||||
navigator.mediaDevices.getUserMedia({video: true, audio: false})
|
||||
.then(function(stream) {
|
||||
camera.srcObject = stream;
|
||||
camera.onloadedmetadata = function(e) {
|
||||
camera.play();
|
||||
};
|
||||
});
|
||||
|
||||
//! [Open a camera stream]
|
||||
var cap = new cv.VideoCapture(camera);
|
||||
var frame = new cv.Mat(camera.height, camera.width, cv.CV_8UC4);
|
||||
var frameBGR = new cv.Mat(camera.height, camera.width, cv.CV_8UC3);
|
||||
//! [Open a camera stream]
|
||||
|
||||
//! [Add a person]
|
||||
document.getElementById('addPersonButton').onclick = function() {
|
||||
var rects = detectFaces(frameBGR);
|
||||
if (rects.length > 0) {
|
||||
var face = frameBGR.roi(rects[0]);
|
||||
|
||||
var name = prompt('Say your name:');
|
||||
var cell = document.getElementById("targetNames").insertCell(0);
|
||||
cell.innerHTML = name;
|
||||
|
||||
persons[name] = face2vec(face).clone();
|
||||
|
||||
var canvas = document.createElement("canvas");
|
||||
canvas.setAttribute("width", 96);
|
||||
canvas.setAttribute("height", 96);
|
||||
var cell = document.getElementById("targetImgs").insertCell(0);
|
||||
cell.appendChild(canvas);
|
||||
|
||||
var faceResized = new cv.Mat(canvas.height, canvas.width, cv.CV_8UC3);
|
||||
cv.resize(face, faceResized, {width: canvas.width, height: canvas.height});
|
||||
cv.cvtColor(faceResized, faceResized, cv.COLOR_BGR2RGB);
|
||||
cv.imshow(canvas, faceResized);
|
||||
faceResized.delete();
|
||||
}
|
||||
};
|
||||
//! [Add a person]
|
||||
|
||||
//! [Define frames processing]
|
||||
var isRunning = false;
|
||||
const FPS = 30; // Target number of frames processed per second.
|
||||
function captureFrame() {
|
||||
var begin = Date.now();
|
||||
cap.read(frame); // Read a frame from camera
|
||||
cv.cvtColor(frame, frameBGR, cv.COLOR_RGBA2BGR);
|
||||
|
||||
var faces = detectFaces(frameBGR);
|
||||
faces.forEach(function(rect) {
|
||||
cv.rectangle(frame, {x: rect.x, y: rect.y}, {x: rect.x + rect.width, y: rect.y + rect.height}, [0, 255, 0, 255]);
|
||||
|
||||
var face = frameBGR.roi(rect);
|
||||
var name = recognize(face);
|
||||
cv.putText(frame, name, {x: rect.x, y: rect.y}, cv.FONT_HERSHEY_SIMPLEX, 1.0, [0, 255, 0, 255]);
|
||||
});
|
||||
|
||||
cv.imshow(output, frame);
|
||||
|
||||
// Loop this function.
|
||||
if (isRunning) {
|
||||
var delay = 1000 / FPS - (Date.now() - begin);
|
||||
setTimeout(captureFrame, delay);
|
||||
}
|
||||
};
|
||||
//! [Define frames processing]
|
||||
|
||||
document.getElementById('startStopButton').onclick = function toggle() {
|
||||
if (isRunning) {
|
||||
isRunning = false;
|
||||
document.getElementById('startStopButton').innerHTML = 'Start';
|
||||
document.getElementById('addPersonButton').disabled = true;
|
||||
} else {
|
||||
function run() {
|
||||
isRunning = true;
|
||||
captureFrame();
|
||||
document.getElementById('startStopButton').innerHTML = 'Stop';
|
||||
document.getElementById('startStopButton').disabled = false;
|
||||
document.getElementById('addPersonButton').disabled = false;
|
||||
}
|
||||
if (netDet == undefined || netRecogn == undefined) {
|
||||
document.getElementById('startStopButton').disabled = true;
|
||||
loadModels(run); // Load models and run a pipeline;
|
||||
} else {
|
||||
run();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
document.getElementById('startStopButton').disabled = false;
|
||||
};
|
||||
</script>
|
||||
|
||||
</head>
|
||||
|
||||
<body onload="cv['onRuntimeInitialized']=()=>{ main() }">
|
||||
<button id="startStopButton" type="button" disabled="true">Start</button>
|
||||
<div id="status"></div>
|
||||
<canvas id="output" width=640 height=480 style="max-width: 100%"></canvas>
|
||||
|
||||
<table>
|
||||
<tr id="targetImgs"></tr>
|
||||
<tr id="targetNames"></tr>
|
||||
</table>
|
||||
<button id="addPersonButton" type="button" disabled="true">Add a person</button>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
143
samples/dnn/mask_rcnn.py
Normal file
143
samples/dnn/mask_rcnn.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
'Use this script to run Mask-RCNN object detection and semantic '
|
||||
'segmentation network from TensorFlow Object Detection API.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--model', required=True, help='Path to a .pb file with weights.')
|
||||
parser.add_argument('--config', required=True, help='Path to a .pbtxt file that contains network configuration.')
|
||||
parser.add_argument('--classes', help='Optional path to a text file with names of classes.')
|
||||
parser.add_argument('--colors', help='Optional path to a text file with colors for every class. '
|
||||
'Every color is represented with three values from 0 to 255 in BGR channel order.')
|
||||
parser.add_argument('--width', type=int, default=800,
|
||||
help='Preprocess input image by resizing to a specific width.')
|
||||
parser.add_argument('--height', type=int, default=800,
|
||||
help='Preprocess input image by resizing to a specific height.')
|
||||
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
|
||||
args = parser.parse_args()
|
||||
|
||||
np.random.seed(324)
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load colors
|
||||
colors = None
|
||||
if args.colors:
|
||||
with open(args.colors, 'rt') as f:
|
||||
colors = [np.array(color.split(' '), np.uint8) for color in f.read().rstrip('\n').split('\n')]
|
||||
|
||||
legend = None
|
||||
def showLegend(classes):
|
||||
global legend
|
||||
if classes is not None and legend is None:
|
||||
blockHeight = 30
|
||||
assert(len(classes) == len(colors))
|
||||
|
||||
legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
|
||||
for i in range(len(classes)):
|
||||
block = legend[i * blockHeight:(i + 1) * blockHeight]
|
||||
block[:,:] = colors[i]
|
||||
cv.putText(block, classes[i], (0, blockHeight//2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
|
||||
|
||||
cv.namedWindow('Legend', cv.WINDOW_NORMAL)
|
||||
cv.imshow('Legend', legend)
|
||||
classes = None
|
||||
|
||||
|
||||
def drawBox(frame, classId, conf, left, top, right, bottom):
|
||||
# Draw a bounding box.
|
||||
cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
|
||||
|
||||
label = '%.2f' % conf
|
||||
|
||||
# Print a label of class.
|
||||
if classes:
|
||||
assert(classId < len(classes))
|
||||
label = '%s: %s' % (classes[classId], label)
|
||||
|
||||
labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
||||
top = max(top, labelSize[1])
|
||||
cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
|
||||
cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
winName = 'Mask-RCNN in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
|
||||
legend = None
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameH = frame.shape[0]
|
||||
frameW = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
blob = cv.dnn.blobFromImage(frame, size=(args.width, args.height), swapRB=True, crop=False)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob)
|
||||
|
||||
boxes, masks = net.forward(['detection_out_final', 'detection_masks'])
|
||||
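# 'detection_out_final' is expected to be a 1x1xNx7 blob of
# [batchId, classId, score, left, top, right, bottom] with relative box coordinates,
# while 'detection_masks' holds per-detection, per-class mask logits (typically NxCx15x15)
# that are resized to each box below.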
|
||||
numClasses = masks.shape[1]
|
||||
numDetections = boxes.shape[2]
|
||||
|
||||
# Draw segmentation
|
||||
if not colors:
|
||||
# Generate colors
|
||||
colors = [np.array([0, 0, 0], np.uint8)]
|
||||
for i in range(1, numClasses + 1):
|
||||
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
|
||||
del colors[0]
|
||||
|
||||
boxesToDraw = []
|
||||
for i in range(numDetections):
|
||||
box = boxes[0, 0, i]
|
||||
mask = masks[i]
|
||||
score = box[2]
|
||||
if score > args.thr:
|
||||
classId = int(box[1])
|
||||
left = int(frameW * box[3])
|
||||
top = int(frameH * box[4])
|
||||
right = int(frameW * box[5])
|
||||
bottom = int(frameH * box[6])
|
||||
|
||||
left = max(0, min(left, frameW - 1))
|
||||
top = max(0, min(top, frameH - 1))
|
||||
right = max(0, min(right, frameW - 1))
|
||||
bottom = max(0, min(bottom, frameH - 1))
|
||||
|
||||
boxesToDraw.append([frame, classId, score, left, top, right, bottom])
|
||||
|
||||
classMask = mask[classId]
|
||||
classMask = cv.resize(classMask, (right - left + 1, bottom - top + 1))
|
||||
mask = (classMask > 0.5)
|
||||
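# Binarize the resized mask, then alpha-blend the class color (70%) with the original
# pixels (30%) inside the detection box.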
|
||||
roi = frame[top:bottom+1, left:right+1][mask]
|
||||
frame[top:bottom+1, left:right+1][mask] = (0.7 * colors[classId] + 0.3 * roi).astype(np.uint8)
|
||||
|
||||
for box in boxesToDraw:
|
||||
drawBox(*box)
|
||||
|
||||
# Put efficiency information.
|
||||
t, _ = net.getPerfProfile()
|
||||
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
showLegend(classes)
|
||||
|
||||
cv.imshow(winName, frame)
|
||||
133
samples/dnn/mobilenet_ssd_accuracy.py
Normal file
133
samples/dnn/mobilenet_ssd_accuracy.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from __future__ import print_function
|
||||
# Script to evaluate MobileNet-SSD object detection model trained in TensorFlow
|
||||
# using both TensorFlow and OpenCV. Example:
|
||||
#
|
||||
# python mobilenet_ssd_accuracy.py \
|
||||
# --weights=frozen_inference_graph.pb \
|
||||
# --prototxt=ssd_mobilenet_v1_coco.pbtxt \
|
||||
# --images=val2017 \
|
||||
# --annotations=annotations/instances_val2017.json
|
||||
#
|
||||
# Tested on COCO 2017 object detection dataset, http://cocodataset.org/#download
|
||||
import os
|
||||
import cv2 as cv
|
||||
import json
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Evaluate MobileNet-SSD model using both TensorFlow and OpenCV. '
|
||||
'COCO evaluation framework is required: http://cocodataset.org')
|
||||
parser.add_argument('--weights', required=True,
|
||||
help='Path to frozen_inference_graph.pb of MobileNet-SSD model. '
|
||||
'Download it from http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz')
|
||||
parser.add_argument('--prototxt', help='Path to ssd_mobilenet_v1_coco.pbtxt from opencv_extra.', required=True)
|
||||
parser.add_argument('--images', help='Path to COCO validation images directory.', required=True)
|
||||
parser.add_argument('--annotations', help='Path to COCO annotations file.', required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
### Get OpenCV predictions #####################################################
|
||||
net = cv.dnn.readNetFromTensorflow(cv.samples.findFile(args.weights), cv.samples.findFile(args.prototxt))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
detections = []
|
||||
for imgName in os.listdir(args.images):
|
||||
inp = cv.imread(cv.samples.findFile(os.path.join(args.images, imgName)))
|
||||
rows = inp.shape[0]
|
||||
cols = inp.shape[1]
|
||||
inp = cv.resize(inp, (300, 300))
|
||||
|
||||
net.setInput(cv.dnn.blobFromImage(inp, 1.0/127.5, (300, 300), (127.5, 127.5, 127.5), True))
|
||||
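# The blob above maps input pixels to roughly [-1, 1] (scale 1/127.5, mean 127.5),
# matching MobileNet-SSD preprocessing; swapRB=True converts OpenCV's BGR order to RGB.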
out = net.forward()
|
||||
|
||||
for i in range(out.shape[2]):
|
||||
score = float(out[0, 0, i, 2])
|
||||
# Confidence threshold is in prototxt.
|
||||
classId = int(out[0, 0, i, 1])
|
||||
|
||||
x = out[0, 0, i, 3] * cols
|
||||
y = out[0, 0, i, 4] * rows
|
||||
w = out[0, 0, i, 5] * cols - x
|
||||
h = out[0, 0, i, 6] * rows - y
|
||||
detections.append({
|
||||
"image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
|
||||
"category_id": classId,
|
||||
"bbox": [x, y, w, h],
|
||||
"score": score
|
||||
})
|
||||
|
||||
with open('cv_result.json', 'wt') as f:
|
||||
json.dump(detections, f)
|
||||
|
||||
### Get TensorFlow predictions #################################################
|
||||
import tensorflow as tf
|
||||
|
||||
with tf.gfile.FastGFile(args.weights, 'rb') as f:
|
||||
# Load the model
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
with tf.Session() as sess:
|
||||
# Restore session
|
||||
sess.graph.as_default()
|
||||
tf.import_graph_def(graph_def, name='')
|
||||
|
||||
detections = []
|
||||
for imgName in os.listdir(args.images):
|
||||
inp = cv.imread(os.path.join(args.images, imgName))
|
||||
rows = inp.shape[0]
|
||||
cols = inp.shape[1]
|
||||
inp = cv.resize(inp, (300, 300))
|
||||
inp = inp[:, :, [2, 1, 0]] # BGR2RGB
|
||||
out = sess.run([sess.graph.get_tensor_by_name('num_detections:0'),
|
||||
sess.graph.get_tensor_by_name('detection_scores:0'),
|
||||
sess.graph.get_tensor_by_name('detection_boxes:0'),
|
||||
sess.graph.get_tensor_by_name('detection_classes:0')],
|
||||
feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})
|
||||
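# out is [num_detections, detection_scores, detection_boxes, detection_classes];
# the boxes are returned as normalized [ymin, xmin, ymax, xmax], which explains the
# index order used below.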
num_detections = int(out[0][0])
|
||||
for i in range(num_detections):
|
||||
classId = int(out[3][0][i])
|
||||
score = float(out[1][0][i])
|
||||
bbox = [float(v) for v in out[2][0][i]]
|
||||
if score > 0.01:
|
||||
x = bbox[1] * cols
|
||||
y = bbox[0] * rows
|
||||
w = bbox[3] * cols - x
|
||||
h = bbox[2] * rows - y
|
||||
detections.append({
|
||||
"image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
|
||||
"category_id": classId,
|
||||
"bbox": [x, y, w, h],
|
||||
"score": score
|
||||
})
|
||||
|
||||
with open('tf_result.json', 'wt') as f:
|
||||
json.dump(detections, f)
|
||||
|
||||
### Evaluation part ############################################################
|
||||
|
||||
# %matplotlib inline
|
||||
import matplotlib.pyplot as plt
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
import numpy as np
|
||||
import skimage.io as io
|
||||
import pylab
|
||||
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
|
||||
|
||||
annType = ['segm','bbox','keypoints']
|
||||
annType = annType[1] #specify type here
|
||||
prefix = 'person_keypoints' if annType=='keypoints' else 'instances'
|
||||
print('Running demo for *%s* results.'%(annType))
|
||||
|
||||
#initialize COCO ground truth api
|
||||
cocoGt=COCO(args.annotations)
|
||||
|
||||
#initialize COCO detections api
|
||||
for resFile in ['tf_result.json', 'cv_result.json']:
|
||||
print(resFile)
|
||||
cocoDt=cocoGt.loadRes(resFile)
|
||||
|
||||
cocoEval = COCOeval(cocoGt,cocoDt,annType)
|
||||
cocoEval.evaluate()
|
||||
cocoEval.accumulate()
|
||||
cocoEval.summarize()
|
||||
166
samples/dnn/models.yml
Normal file
166
samples/dnn/models.yml
Normal file
@@ -0,0 +1,166 @@
|
||||
%YAML 1.0
|
||||
---
|
||||
################################################################################
|
||||
# Object detection models.
|
||||
################################################################################
|
||||
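# Each entry below lists the preprocessing parameters the samples consume: the input is
# resized to width x height and transformed roughly as scale * (pixel - mean), with the
# rgb flag controlling a BGR -> RGB swap.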
|
||||
# OpenCV's face detection network
|
||||
opencv_fd:
|
||||
load_info:
|
||||
url: "https://github.com/opencv/opencv_3rdparty/raw/dnn_samples_face_detector_20170830/res10_300x300_ssd_iter_140000.caffemodel"
|
||||
sha1: "15aa726b4d46d9f023526d85537db81cbc8dd566"
|
||||
model: "opencv_face_detector.caffemodel"
|
||||
config: "opencv_face_detector.prototxt"
|
||||
mean: [104, 177, 123]
|
||||
scale: 1.0
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: false
|
||||
sample: "object_detection"
|
||||
|
||||
# YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet)
|
||||
# YOLO object detection family from Darknet (https://pjreddie.com/darknet/yolo/)
|
||||
# Can be used for YOLOv2, TinyYOLOv2, YOLOv3, YOLOv4 and TinyYOLOv4
|
||||
yolo:
|
||||
load_info:
|
||||
url: "https://pjreddie.com/media/files/yolov3.weights"
|
||||
sha1: "520878f12e97cf820529daea502acca380f1cb8e"
|
||||
model: "yolov3.weights"
|
||||
config: "yolov3.cfg"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 416
|
||||
height: 416
|
||||
rgb: true
|
||||
classes: "object_detection_classes_yolov3.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
tiny-yolo-voc:
|
||||
load_info:
|
||||
url: "https://pjreddie.com/media/files/yolov2-tiny-voc.weights"
|
||||
sha1: "24b4bd049fc4fa5f5e95f684a8967e65c625dff9"
|
||||
model: "tiny-yolo-voc.weights"
|
||||
config: "tiny-yolo-voc.cfg"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 416
|
||||
height: 416
|
||||
rgb: true
|
||||
classes: "object_detection_classes_pascal_voc.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# Caffe implementation of SSD model from https://github.com/chuanqi305/MobileNet-SSD
|
||||
ssd_caffe:
|
||||
load_info:
|
||||
url: "https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc"
|
||||
sha1: "994d30a8afaa9e754d17d2373b2d62a7dfbaaf7a"
|
||||
model: "MobileNetSSD_deploy.caffemodel"
|
||||
config: "MobileNetSSD_deploy.prototxt"
|
||||
mean: [127.5, 127.5, 127.5]
|
||||
scale: 0.007843
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: false
|
||||
classes: "object_detection_classes_pascal_voc.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# TensorFlow implementation of SSD model from https://github.com/tensorflow/models/tree/master/research/object_detection
|
||||
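# For archived models the extra fields are assumed to work as follows: download_name and
# download_sha describe the fetched .tar.gz, member names the file extracted from it, and
# sha1 checks the extracted graph.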
ssd_tf:
|
||||
load_info:
|
||||
url: "http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2017_11_17.tar.gz"
|
||||
sha1: "9e4bcdd98f4c6572747679e4ce570de4f03a70e2"
|
||||
download_sha: "6157ddb6da55db2da89dd561eceb7f944928e317"
|
||||
download_name: "ssd_mobilenet_v1_coco_2017_11_17.tar.gz"
|
||||
member: "ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb"
|
||||
model: "ssd_mobilenet_v1_coco_2017_11_17.pb"
|
||||
config: "ssd_mobilenet_v1_coco_2017_11_17.pbtxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: true
|
||||
classes: "object_detection_classes_coco.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# TensorFlow implementation of Faster-RCNN model from https://github.com/tensorflow/models/tree/master/research/object_detection
|
||||
faster_rcnn_tf:
|
||||
load_info:
|
||||
url: "http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz"
|
||||
sha1: "f2e4bf386b9bb3e25ddfcbbd382c20f417e444f3"
|
||||
download_sha: "c710f25e5c6a3ce85fe793d5bf266d581ab1c230"
|
||||
download_name: "faster_rcnn_inception_v2_coco_2018_01_28.tar.gz"
|
||||
member: "faster_rcnn_inception_v2_coco_2018_01_28/frozen_inference_graph.pb"
|
||||
model: "faster_rcnn_inception_v2_coco_2018_01_28.pb"
|
||||
config: "faster_rcnn_inception_v2_coco_2018_01_28.pbtxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 800
|
||||
height: 600
|
||||
rgb: true
|
||||
sample: "object_detection"
|
||||
|
||||
################################################################################
|
||||
# Image classification models.
|
||||
################################################################################
|
||||
|
||||
# SqueezeNet v1.1 from https://github.com/DeepScale/SqueezeNet
|
||||
squeezenet:
|
||||
load_info:
|
||||
url: "https://raw.githubusercontent.com/DeepScale/SqueezeNet/b5c3f1a23713c8b3fd7b801d229f6b04c64374a5/SqueezeNet_v1.1/squeezenet_v1.1.caffemodel"
|
||||
sha1: "3397f026368a45ae236403ccc81cfcbe8ebe1bd0"
|
||||
model: "squeezenet_v1.1.caffemodel"
|
||||
config: "squeezenet_v1.1.prototxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 227
|
||||
height: 227
|
||||
rgb: false
|
||||
classes: "classification_classes_ILSVRC2012.txt"
|
||||
sample: "classification"
|
||||
|
||||
# Googlenet from https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
|
||||
googlenet:
|
||||
load_info:
|
||||
url: "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel"
|
||||
sha1: "405fc5acd08a3bb12de8ee5e23a96bec22f08204"
|
||||
model: "bvlc_googlenet.caffemodel"
|
||||
config: "bvlc_googlenet.prototxt"
|
||||
mean: [104, 117, 123]
|
||||
scale: 1.0
|
||||
width: 224
|
||||
height: 224
|
||||
rgb: false
|
||||
classes: "classification_classes_ILSVRC2012.txt"
|
||||
sample: "classification"
|
||||
|
||||
################################################################################
|
||||
# Semantic segmentation models.
|
||||
################################################################################
|
||||
|
||||
# ENet road scene segmentation network from https://github.com/e-lab/ENet-training
|
||||
# Works fine for different input sizes.
|
||||
enet:
|
||||
load_info:
|
||||
url: "https://www.dropbox.com/s/tdde0mawbi5dugq/Enet-model-best.net?dl=1"
|
||||
sha1: "b4123a73bf464b9ebe9cfc4ab9c2d5c72b161315"
|
||||
model: "Enet-model-best.net"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 512
|
||||
height: 256
|
||||
rgb: true
|
||||
classes: "enet-classes.txt"
|
||||
sample: "segmentation"
|
||||
|
||||
fcn8s:
|
||||
load_info:
|
||||
url: "http://dl.caffe.berkeleyvision.org/fcn8s-heavy-pascal.caffemodel"
|
||||
sha1: "c449ea74dd7d83751d1357d6a8c323fcf4038962"
|
||||
model: "fcn8s-heavy-pascal.caffemodel"
|
||||
config: "fcn8s-heavy-pascal.prototxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 500
|
||||
height: 500
|
||||
rgb: false
|
||||
sample: "segmentation"
|
||||
471
samples/dnn/object_detection.cpp
Normal file
471
samples/dnn/object_detection.cpp
Normal file
@@ -0,0 +1,471 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
#ifdef CV_CXX11
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <queue>
|
||||
#endif
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
|
||||
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
|
||||
"{ device | 0 | camera device number. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
|
||||
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
|
||||
"{ classes | | Optional path to a text file with names of classes to label detected objects. }"
|
||||
"{ thr | .5 | Confidence threshold. }"
|
||||
"{ nms | .4 | Non-maximum suppression threshold. }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation }"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU }"
|
||||
"{ async | 0 | Number of asynchronous forwards at the same time. "
|
||||
"Choose 0 for synchronous mode }";
|
||||
|
||||
using namespace cv;
|
||||
using namespace dnn;
|
||||
|
||||
float confThreshold, nmsThreshold;
|
||||
std::vector<std::string> classes;
|
||||
|
||||
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
|
||||
const Scalar& mean, bool swapRB);
|
||||
|
||||
void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net, int backend);
|
||||
|
||||
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
|
||||
|
||||
void callback(int pos, void* userdata);
|
||||
|
||||
#ifdef CV_CXX11
|
||||
template <typename T>
|
||||
class QueueFPS : public std::queue<T>
|
||||
{
|
||||
public:
|
||||
QueueFPS() : counter(0) {}
|
||||
|
||||
void push(const T& entry)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
std::queue<T>::push(entry);
|
||||
counter += 1;
|
||||
if (counter == 1)
|
||||
{
|
||||
// Start counting from a second frame (warmup).
|
||||
tm.reset();
|
||||
tm.start();
|
||||
}
|
||||
}
|
||||
|
||||
T get()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
T entry = this->front();
|
||||
this->pop();
|
||||
return entry;
|
||||
}
|
||||
|
||||
float getFPS()
|
||||
{
|
||||
tm.stop();
|
||||
double fps = counter / tm.getTimeSec();
|
||||
tm.start();
|
||||
return static_cast<float>(fps);
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
while (!this->empty())
|
||||
this->pop();
|
||||
}
|
||||
|
||||
unsigned int counter;
|
||||
|
||||
private:
|
||||
TickMeter tm;
|
||||
std::mutex mutex;
|
||||
};
|
||||
#endif // CV_CXX11
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
const std::string modelName = parser.get<String>("@alias");
|
||||
const std::string zooFile = parser.get<String>("zoo");
|
||||
|
||||
keys += genPreprocArguments(modelName, zooFile);
|
||||
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run object detection deep learning networks using OpenCV.");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
confThreshold = parser.get<float>("thr");
|
||||
nmsThreshold = parser.get<float>("nms");
|
||||
float scale = parser.get<float>("scale");
|
||||
Scalar mean = parser.get<Scalar>("mean");
|
||||
bool swapRB = parser.get<bool>("rgb");
|
||||
int inpWidth = parser.get<int>("width");
|
||||
int inpHeight = parser.get<int>("height");
|
||||
size_t asyncNumReq = parser.get<int>("async");
|
||||
CV_Assert(parser.has("model"));
|
||||
std::string modelPath = findFile(parser.get<String>("model"));
|
||||
std::string configPath = findFile(parser.get<String>("config"));
|
||||
|
||||
// Open file with classes names.
|
||||
if (parser.has("classes"))
|
||||
{
|
||||
std::string file = parser.get<String>("classes");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
classes.push_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
// Load a model.
|
||||
Net net = readNet(modelPath, configPath, parser.get<String>("framework"));
|
||||
int backend = parser.get<int>("backend");
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(parser.get<int>("target"));
|
||||
std::vector<String> outNames = net.getUnconnectedOutLayersNames();
|
||||
|
||||
// Create a window
|
||||
static const std::string kWinName = "Deep learning object detection in OpenCV";
|
||||
namedWindow(kWinName, WINDOW_NORMAL);
|
||||
int initialConf = (int)(confThreshold * 100);
|
||||
createTrackbar("Confidence threshold, %", kWinName, &initialConf, 99, callback);
|
||||
|
||||
// Open a video file or an image file or a camera stream.
|
||||
VideoCapture cap;
|
||||
if (parser.has("input"))
|
||||
cap.open(parser.get<String>("input"));
|
||||
else
|
||||
cap.open(parser.get<int>("device"));
|
||||
|
||||
#ifdef CV_CXX11
|
||||
bool process = true;
|
||||
|
||||
// Frames capturing thread
|
||||
QueueFPS<Mat> framesQueue;
|
||||
std::thread framesThread([&](){
|
||||
Mat frame;
|
||||
while (process)
|
||||
{
|
||||
cap >> frame;
|
||||
if (!frame.empty())
|
||||
framesQueue.push(frame.clone());
|
||||
else
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
// Frames processing thread
|
||||
QueueFPS<Mat> processedFramesQueue;
|
||||
QueueFPS<std::vector<Mat> > predictionsQueue;
|
||||
std::thread processingThread([&](){
|
||||
std::queue<AsyncArray> futureOutputs;
|
||||
Mat blob;
|
||||
while (process)
|
||||
{
|
||||
// Get a next frame
|
||||
Mat frame;
|
||||
{
|
||||
if (!framesQueue.empty())
|
||||
{
|
||||
frame = framesQueue.get();
|
||||
if (asyncNumReq)
|
||||
{
|
||||
if (futureOutputs.size() == asyncNumReq)
|
||||
frame = Mat();
|
||||
}
|
||||
else
|
||||
framesQueue.clear(); // Skip the rest of frames
|
||||
}
|
||||
}
|
||||
|
||||
// Process the frame
|
||||
if (!frame.empty())
|
||||
{
|
||||
preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
|
||||
processedFramesQueue.push(frame);
|
||||
|
||||
if (asyncNumReq)
|
||||
{
|
||||
futureOutputs.push(net.forwardAsync());
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<Mat> outs;
|
||||
net.forward(outs, outNames);
|
||||
predictionsQueue.push(outs);
|
||||
}
|
||||
}
|
||||
|
||||
while (!futureOutputs.empty() &&
|
||||
futureOutputs.front().wait_for(std::chrono::seconds(0)))
|
||||
{
|
||||
AsyncArray async_out = futureOutputs.front();
|
||||
futureOutputs.pop();
|
||||
Mat out;
|
||||
async_out.get(out);
|
||||
predictionsQueue.push({out});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Postprocessing and rendering loop
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
if (predictionsQueue.empty())
|
||||
continue;
|
||||
|
||||
std::vector<Mat> outs = predictionsQueue.get();
|
||||
Mat frame = processedFramesQueue.get();
|
||||
|
||||
postprocess(frame, outs, net, backend);
|
||||
|
||||
if (predictionsQueue.counter > 1)
|
||||
{
|
||||
std::string label = format("Camera: %.2f FPS", framesQueue.getFPS());
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
label = format("Network: %.2f FPS", predictionsQueue.getFPS());
|
||||
putText(frame, label, Point(0, 30), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
label = format("Skipped frames: %d", framesQueue.counter - predictionsQueue.counter);
|
||||
putText(frame, label, Point(0, 45), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
}
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
|
||||
process = false;
|
||||
framesThread.join();
|
||||
processingThread.join();
|
||||
|
||||
#else // CV_CXX11
|
||||
if (asyncNumReq)
|
||||
CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend.");
|
||||
|
||||
// Process frames.
|
||||
Mat frame, blob;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
|
||||
|
||||
std::vector<Mat> outs;
|
||||
net.forward(outs, outNames);
|
||||
|
||||
postprocess(frame, outs, net, backend);
|
||||
|
||||
// Put efficiency information.
|
||||
std::vector<double> layersTimes;
|
||||
double freq = getTickFrequency() / 1000;
|
||||
double t = net.getPerfProfile(layersTimes) / freq;
|
||||
std::string label = format("Inference time: %.2f ms", t);
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
#endif // CV_CXX11
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
|
||||
const Scalar& mean, bool swapRB)
|
||||
{
|
||||
static Mat blob;
|
||||
// Create a 4D blob from a frame.
|
||||
if (inpSize.width <= 0) inpSize.width = frame.cols;
|
||||
if (inpSize.height <= 0) inpSize.height = frame.rows;
|
||||
blobFromImage(frame, blob, 1.0, inpSize, Scalar(), swapRB, false, CV_8U);
|
||||
|
||||
// Run a model.
|
||||
net.setInput(blob, "", scale, mean);
|
||||
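// Faster-RCNN and R-FCN graphs additionally expect an "im_info" input of
// [height, width, scale]; the 1.6f value below is the constant used by these samples
// (an empirical setting, not derived here).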
if (net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN
|
||||
{
|
||||
resize(frame, frame, inpSize);
|
||||
Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
|
||||
net.setInput(imInfo, "im_info");
|
||||
}
|
||||
}
|
||||
|
||||
void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net, int backend)
|
||||
{
|
||||
static std::vector<int> outLayers = net.getUnconnectedOutLayers();
|
||||
static std::string outLayerType = net.getLayer(outLayers[0])->type;
|
||||
|
||||
std::vector<int> classIds;
|
||||
std::vector<float> confidences;
|
||||
std::vector<Rect> boxes;
|
||||
if (outLayerType == "DetectionOutput")
|
||||
{
|
||||
// Network produces output blob with a shape 1x1xNx7 where N is the number of
|
||||
// detections and every detection is a vector of values
|
||||
// [batchId, classId, confidence, left, top, right, bottom]
|
||||
CV_Assert(outs.size() > 0);
|
||||
for (size_t k = 0; k < outs.size(); k++)
|
||||
{
|
||||
float* data = (float*)outs[k].data;
|
||||
for (size_t i = 0; i < outs[k].total(); i += 7)
|
||||
{
|
||||
float confidence = data[i + 2];
|
||||
if (confidence > confThreshold)
|
||||
{
|
||||
int left = (int)data[i + 3];
|
||||
int top = (int)data[i + 4];
|
||||
int right = (int)data[i + 5];
|
||||
int bottom = (int)data[i + 6];
|
||||
int width = right - left + 1;
|
||||
int height = bottom - top + 1;
|
||||
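// Some DetectionOutput layers emit absolute pixel coordinates while others emit values
// normalized to [0, 1]; a near-degenerate box (<= 2 px) is taken as a hint that the
// values are normalized and must be rescaled by the frame size.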
if (width <= 2 || height <= 2)
|
||||
{
|
||||
left = (int)(data[i + 3] * frame.cols);
|
||||
top = (int)(data[i + 4] * frame.rows);
|
||||
right = (int)(data[i + 5] * frame.cols);
|
||||
bottom = (int)(data[i + 6] * frame.rows);
|
||||
width = right - left + 1;
|
||||
height = bottom - top + 1;
|
||||
}
|
||||
classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
|
||||
boxes.push_back(Rect(left, top, width, height));
|
||||
confidences.push_back(confidence);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (outLayerType == "Region")
|
||||
{
|
||||
for (size_t i = 0; i < outs.size(); ++i)
|
||||
{
|
||||
// Network produces output blob with a shape NxC where N is the number of
|
||||
// detected objects and C is the number of classes + 4, where the first 4
|
||||
// numbers are [center_x, center_y, width, height]
|
||||
float* data = (float*)outs[i].data;
|
||||
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
|
||||
{
|
||||
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
|
||||
Point classIdPoint;
|
||||
double confidence;
|
||||
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
|
||||
if (confidence > confThreshold)
|
||||
{
|
||||
int centerX = (int)(data[0] * frame.cols);
|
||||
int centerY = (int)(data[1] * frame.rows);
|
||||
int width = (int)(data[2] * frame.cols);
|
||||
int height = (int)(data[3] * frame.rows);
|
||||
int left = centerX - width / 2;
|
||||
int top = centerY - height / 2;
|
||||
|
||||
classIds.push_back(classIdPoint.x);
|
||||
confidences.push_back((float)confidence);
|
||||
boxes.push_back(Rect(left, top, width, height));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
|
||||
|
||||
// NMS is used inside the Region layer only with DNN_BACKEND_OPENCV; for other backends we apply NMS in the sample,
|
||||
// and NMS is also required if the number of outputs > 1
|
||||
if (outLayers.size() > 1 || (outLayerType == "Region" && backend != DNN_BACKEND_OPENCV))
|
||||
{
|
||||
std::map<int, std::vector<size_t> > class2indices;
|
||||
for (size_t i = 0; i < classIds.size(); i++)
|
||||
{
|
||||
if (confidences[i] >= confThreshold)
|
||||
{
|
||||
class2indices[classIds[i]].push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<Rect> nmsBoxes;
|
||||
std::vector<float> nmsConfidences;
|
||||
std::vector<int> nmsClassIds;
|
||||
for (std::map<int, std::vector<size_t> >::iterator it = class2indices.begin(); it != class2indices.end(); ++it)
|
||||
{
|
||||
std::vector<Rect> localBoxes;
|
||||
std::vector<float> localConfidences;
|
||||
std::vector<size_t> classIndices = it->second;
|
||||
for (size_t i = 0; i < classIndices.size(); i++)
|
||||
{
|
||||
localBoxes.push_back(boxes[classIndices[i]]);
|
||||
localConfidences.push_back(confidences[classIndices[i]]);
|
||||
}
|
||||
std::vector<int> nmsIndices;
|
||||
NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, nmsIndices);
|
||||
for (size_t i = 0; i < nmsIndices.size(); i++)
|
||||
{
|
||||
size_t idx = nmsIndices[i];
|
||||
nmsBoxes.push_back(localBoxes[idx]);
|
||||
nmsConfidences.push_back(localConfidences[idx]);
|
||||
nmsClassIds.push_back(it->first);
|
||||
}
|
||||
}
|
||||
boxes = nmsBoxes;
|
||||
classIds = nmsClassIds;
|
||||
confidences = nmsConfidences;
|
||||
}
|
||||
|
||||
for (size_t idx = 0; idx < boxes.size(); ++idx)
|
||||
{
|
||||
Rect box = boxes[idx];
|
||||
drawPred(classIds[idx], confidences[idx], box.x, box.y,
|
||||
box.x + box.width, box.y + box.height, frame);
|
||||
}
|
||||
}
|
||||
|
||||
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
|
||||
{
|
||||
rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
|
||||
|
||||
std::string label = format("%.2f", conf);
|
||||
if (!classes.empty())
|
||||
{
|
||||
CV_Assert(classId < (int)classes.size());
|
||||
label = classes[classId] + ": " + label;
|
||||
}
|
||||
|
||||
int baseLine;
|
||||
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
|
||||
|
||||
top = max(top, labelSize.height);
|
||||
rectangle(frame, Point(left, top - labelSize.height),
|
||||
Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
|
||||
putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
|
||||
}
|
||||
|
||||
void callback(int pos, void*)
|
||||
{
|
||||
confThreshold = pos * 0.01f;
|
||||
}
|
||||
322
samples/dnn/object_detection.py
Normal file
322
samples/dnn/object_detection.py
Normal file
@@ -0,0 +1,322 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
import sys
|
||||
import time
|
||||
from threading import Thread
|
||||
if sys.version_info[0] == 2:
|
||||
import Queue as queue
|
||||
else:
|
||||
import queue
|
||||
|
||||
from common import *
|
||||
from tf_text_graph_common import readTextMessage
|
||||
from tf_text_graph_ssd import createSSDGraph
|
||||
from tf_text_graph_faster_rcnn import createFasterRCNNGraph
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
|
||||
|
||||
parser = argparse.ArgumentParser(add_help=False)
|
||||
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
|
||||
help='An optional path to file with preprocessing parameters.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--out_tf_graph', default='graph.pbtxt',
|
||||
help='For models from TensorFlow Object Detection API, you may '
|
||||
'pass a .config file which was used for training through --config '
|
||||
'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')
|
||||
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt'],
|
||||
help='Optional name of the origin framework of the model. '
|
||||
'Detect it automatically if it is not set.')
|
||||
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
|
||||
parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU' % targets)
|
||||
parser.add_argument('--async', type=int, default=0,
|
||||
dest='asyncN',
|
||||
help='Number of asynchronous forwards at the same time. '
|
||||
'Choose 0 for synchronous mode')
|
||||
args, _ = parser.parse_known_args()
|
||||
add_preproc_args(args.zoo, parser, 'object_detection')
|
||||
parser = argparse.ArgumentParser(parents=[parser],
|
||||
description='Use this script to run object detection deep learning networks using OpenCV.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
args = parser.parse_args()
|
||||
|
||||
args.model = findFile(args.model)
|
||||
args.config = findFile(args.config)
|
||||
args.classes = findFile(args.classes)
|
||||
|
||||
# If config specified, try to load it as TensorFlow Object Detection API's pipeline.
|
||||
config = readTextMessage(args.config)
|
||||
if 'model' in config:
|
||||
print('TensorFlow Object Detection API config detected')
|
||||
if 'ssd' in config['model'][0]:
|
||||
print('Preparing text graph representation for SSD model: ' + args.out_tf_graph)
|
||||
createSSDGraph(args.model, args.config, args.out_tf_graph)
|
||||
args.config = args.out_tf_graph
|
||||
elif 'faster_rcnn' in config['model'][0]:
|
||||
print('Preparing text graph representation for Faster-RCNN model: ' + args.out_tf_graph)
|
||||
createFasterRCNNGraph(args.model, args.config, args.out_tf_graph)
|
||||
args.config = args.out_tf_graph
|
||||
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config), args.framework)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
outNames = net.getUnconnectedOutLayersNames()
|
||||
|
||||
confThreshold = args.thr
|
||||
nmsThreshold = args.nms
|
||||
|
||||
def postprocess(frame, outs):
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
def drawPred(classId, conf, left, top, right, bottom):
|
||||
# Draw a bounding box.
|
||||
cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
|
||||
|
||||
label = '%.2f' % conf
|
||||
|
||||
# Print a label of class.
|
||||
if classes:
|
||||
assert(classId < len(classes))
|
||||
label = '%s: %s' % (classes[classId], label)
|
||||
|
||||
labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
||||
top = max(top, labelSize[1])
|
||||
cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
|
||||
cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
layerNames = net.getLayerNames()
|
||||
lastLayerId = net.getLayerId(layerNames[-1])
|
||||
lastLayer = net.getLayer(lastLayerId)
|
||||
|
||||
classIds = []
|
||||
confidences = []
|
||||
boxes = []
|
||||
if lastLayer.type == 'DetectionOutput':
|
||||
# Network produces output blob with a shape 1x1xNx7 where N is the number of
|
||||
# detections and every detection is a vector of values
|
||||
# [batchId, classId, confidence, left, top, right, bottom]
|
||||
for out in outs:
|
||||
for detection in out[0, 0]:
|
||||
confidence = detection[2]
|
||||
if confidence > confThreshold:
|
||||
left = int(detection[3])
|
||||
top = int(detection[4])
|
||||
right = int(detection[5])
|
||||
bottom = int(detection[6])
|
||||
width = right - left + 1
|
||||
height = bottom - top + 1
|
||||
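# Some DetectionOutput layers emit absolute pixel coordinates while others emit values
# normalized to [0, 1]; a near-degenerate box (<= 2 px) is taken as a hint that the
# values are normalized and must be rescaled by the frame size.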
if width <= 2 or height <= 2:
|
||||
left = int(detection[3] * frameWidth)
|
||||
top = int(detection[4] * frameHeight)
|
||||
right = int(detection[5] * frameWidth)
|
||||
bottom = int(detection[6] * frameHeight)
|
||||
width = right - left + 1
|
||||
height = bottom - top + 1
|
||||
classIds.append(int(detection[1]) - 1) # Skip background label
|
||||
confidences.append(float(confidence))
|
||||
boxes.append([left, top, width, height])
|
||||
elif lastLayer.type == 'Region':
|
||||
# Network produces output blob with a shape NxC where N is the number of
|
||||
# detected objects and C is the number of classes + 4, where the first 4
|
||||
# numbers are [center_x, center_y, width, height]
|
||||
for out in outs:
|
||||
for detection in out:
|
||||
scores = detection[5:]
|
||||
classId = np.argmax(scores)
|
||||
confidence = scores[classId]
|
||||
if confidence > confThreshold:
|
||||
center_x = int(detection[0] * frameWidth)
|
||||
center_y = int(detection[1] * frameHeight)
|
||||
width = int(detection[2] * frameWidth)
|
||||
height = int(detection[3] * frameHeight)
|
||||
left = int(center_x - width / 2)
|
||||
top = int(center_y - height / 2)
|
||||
classIds.append(classId)
|
||||
confidences.append(float(confidence))
|
||||
boxes.append([left, top, width, height])
|
||||
else:
|
||||
print('Unknown output layer type: ' + lastLayer.type)
|
||||
exit()
|
||||
|
||||
# NMS is used inside the Region layer only with DNN_BACKEND_OPENCV; for other backends we apply NMS in the sample,
|
||||
# and NMS is also required if the number of outputs > 1
|
||||
if len(outNames) > 1 or lastLayer.type == 'Region' and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
|
||||
indices = []
|
||||
classIds = np.array(classIds)
|
||||
boxes = np.array(boxes)
|
||||
confidences = np.array(confidences)
|
||||
unique_classes = set(classIds)
|
||||
for cl in unique_classes:
|
||||
class_indices = np.where(classIds == cl)[0]
|
||||
conf = confidences[class_indices]
|
||||
box = boxes[class_indices].tolist()
|
||||
nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
|
||||
nms_indices = nms_indices[:, 0] if len(nms_indices) else []
|
||||
indices.extend(class_indices[nms_indices])
|
||||
else:
|
||||
indices = np.arange(0, len(classIds))
|
||||
|
||||
for i in indices:
|
||||
box = boxes[i]
|
||||
left = box[0]
|
||||
top = box[1]
|
||||
width = box[2]
|
||||
height = box[3]
|
||||
drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
|
||||
|
||||
# Process inputs
|
||||
winName = 'Deep learning object detection in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
def callback(pos):
|
||||
global confThreshold
|
||||
confThreshold = pos / 100.0
|
||||
|
||||
cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)
|
||||
|
||||
cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
|
||||
|
||||
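# QueueFPS extends queue.Queue with a simple throughput estimate: timing starts at the
# first put(), so getFPS() reports items per second since the queue first received data.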
class QueueFPS(queue.Queue):
|
||||
def __init__(self):
|
||||
queue.Queue.__init__(self)
|
||||
self.startTime = 0
|
||||
self.counter = 0
|
||||
|
||||
def put(self, v):
|
||||
queue.Queue.put(self, v)
|
||||
self.counter += 1
|
||||
if self.counter == 1:
|
||||
self.startTime = time.time()
|
||||
|
||||
def getFPS(self):
|
||||
return self.counter / (time.time() - self.startTime)
|
||||
|
||||
|
||||
process = True
|
||||
|
||||
#
|
||||
# Frames capturing thread
|
||||
#
|
||||
framesQueue = QueueFPS()
|
||||
def framesThreadBody():
|
||||
global framesQueue, process
|
||||
|
||||
while process:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
break
|
||||
framesQueue.put(frame)
|
||||
|
||||
|
||||
#
|
||||
# Frames processing thread
|
||||
#
|
||||
processedFramesQueue = queue.Queue()
|
||||
predictionsQueue = QueueFPS()
|
||||
def processingThreadBody():
|
||||
global processedFramesQueue, predictionsQueue, args, process
|
||||
|
||||
futureOutputs = []
|
||||
while process:
|
||||
# Get a next frame
|
||||
frame = None
|
||||
try:
|
||||
frame = framesQueue.get_nowait()
|
||||
|
||||
if args.asyncN:
|
||||
if len(futureOutputs) == args.asyncN:
|
||||
frame = None # Skip the frame
|
||||
else:
|
||||
framesQueue.queue.clear() # Skip the rest of frames
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
|
||||
if frame is not None:
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
inpWidth = args.width if args.width else frameWidth
|
||||
inpHeight = args.height if args.height else frameHeight
|
||||
blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_8U)
|
||||
processedFramesQueue.put(frame)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob, scalefactor=args.scale, mean=args.mean)
|
||||
if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN
|
||||
frame = cv.resize(frame, (inpWidth, inpHeight))
|
||||
net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')
|
||||
|
||||
if args.asyncN:
|
||||
futureOutputs.append(net.forwardAsync())
|
||||
else:
|
||||
outs = net.forward(outNames)
|
||||
predictionsQueue.put(np.copy(outs))
|
||||
|
||||
while futureOutputs and futureOutputs[0].wait_for(0):
|
||||
out = futureOutputs[0].get()
|
||||
predictionsQueue.put(np.copy([out]))
|
||||
|
||||
del futureOutputs[0]
|
||||
|
||||
|
||||
framesThread = Thread(target=framesThreadBody)
|
||||
framesThread.start()
|
||||
|
||||
processingThread = Thread(target=processingThreadBody)
|
||||
processingThread.start()
|
||||
|
||||
#
|
||||
# Postprocessing and rendering loop
|
||||
#
|
||||
while cv.waitKey(1) < 0:
|
||||
try:
|
||||
# Request predictions first because they are put into the queue after the corresponding frames
|
||||
outs = predictionsQueue.get_nowait()
|
||||
frame = processedFramesQueue.get_nowait()
|
||||
|
||||
postprocess(frame, outs)
|
||||
|
||||
# Put efficiency information.
|
||||
if predictionsQueue.counter > 1:
|
||||
label = 'Camera: %.2f FPS' % (framesQueue.getFPS())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
label = 'Network: %.2f FPS' % (predictionsQueue.getFPS())
|
||||
cv.putText(frame, label, (0, 30), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
label = 'Skipped frames: %d' % (framesQueue.counter - predictionsQueue.counter)
|
||||
cv.putText(frame, label, (0, 45), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
cv.imshow(winName, frame)
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
|
||||
process = False
|
||||
framesThread.join()
|
||||
processingThread.join()
|
||||
157
samples/dnn/openpose.cpp
Normal file
157
samples/dnn/openpose.cpp
Normal file
@@ -0,0 +1,157 @@
|
||||
//
|
||||
// this sample demonstrates the use of pretrained openpose networks with opencv's dnn module.
|
||||
//
|
||||
// it can be used for body pose detection, using either the COCO model(18 parts):
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/coco/pose_iter_440000.caffemodel
|
||||
// https://raw.githubusercontent.com/opencv/opencv_extra/master/testdata/dnn/openpose_pose_coco.prototxt
|
||||
//
|
||||
// or the MPI model(16 parts):
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/mpi/pose_iter_160000.caffemodel
|
||||
// https://raw.githubusercontent.com/opencv/opencv_extra/master/testdata/dnn/openpose_pose_mpi_faster_4_stages.prototxt
|
||||
//
|
||||
// (to simplify this sample, the body models are restricted to a single person.)
|
||||
//
|
||||
//
|
||||
// you can also try the hand pose model:
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/hand/pose_iter_102000.caffemodel
|
||||
// https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/hand/pose_deploy.prototxt
|
||||
//
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
// connection table, in the format [model_id][pair_id][from/to]
|
||||
// please look at the nice explanation at the bottom of:
|
||||
// https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/output.md
|
||||
//
|
||||
const int POSE_PAIRS[3][20][2] = {
|
||||
{ // COCO body
|
||||
{1,2}, {1,5}, {2,3},
|
||||
{3,4}, {5,6}, {6,7},
|
||||
{1,8}, {8,9}, {9,10},
|
||||
{1,11}, {11,12}, {12,13},
|
||||
{1,0}, {0,14},
|
||||
{14,16}, {0,15}, {15,17}
|
||||
},
|
||||
{ // MPI body
|
||||
{0,1}, {1,2}, {2,3},
|
||||
{3,4}, {1,5}, {5,6},
|
||||
{6,7}, {1,14}, {14,8}, {8,9},
|
||||
{9,10}, {14,11}, {11,12}, {12,13}
|
||||
},
|
||||
{ // hand
|
||||
{0,1}, {1,2}, {2,3}, {3,4}, // thumb
|
||||
{0,5}, {5,6}, {6,7}, {7,8}, // index
|
||||
{0,9}, {9,10}, {10,11}, {11,12}, // middle
|
||||
{0,13}, {13,14}, {14,15}, {15,16}, // ring
|
||||
{0,17}, {17,18}, {18,19}, {19,20} // small
|
||||
}};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv,
|
||||
"{ h help | false | print this help message }"
|
||||
"{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
|
||||
"{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
|
||||
"{ i image | | (required) path to image file (containing a single person, or hand) }"
|
||||
"{ d dataset | | specify what kind of model was trained. It could be (COCO, MPI, HAND) depends on dataset. }"
|
||||
"{ width | 368 | Preprocess input image by resizing to a specific width. }"
|
||||
"{ height | 368 | Preprocess input image by resizing to a specific height. }"
|
||||
"{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
|
||||
"{ s scale | 0.003922 | scale for blob }"
|
||||
);
|
||||
|
||||
String modelTxt = samples::findFile(parser.get<string>("proto"));
|
||||
String modelBin = samples::findFile(parser.get<string>("model"));
|
||||
String imageFile = samples::findFile(parser.get<String>("image"));
|
||||
String dataset = parser.get<String>("dataset");
|
||||
int W_in = parser.get<int>("width");
|
||||
int H_in = parser.get<int>("height");
|
||||
float thresh = parser.get<float>("threshold");
|
||||
float scale = parser.get<float>("scale");
|
||||
|
||||
if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
|
||||
{
|
||||
cout << "A sample app to demonstrate human or hand pose detection with a pretrained OpenPose dnn." << endl;
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int midx, npairs, nparts;
|
||||
if (!dataset.compare("COCO")) { midx = 0; npairs = 17; nparts = 18; }
|
||||
else if (!dataset.compare("MPI")) { midx = 1; npairs = 14; nparts = 16; }
|
||||
else if (!dataset.compare("HAND")) { midx = 2; npairs = 20; nparts = 22; }
|
||||
else
|
||||
{
|
||||
std::cerr << "Can't interpret dataset parameter: " << dataset << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// read the network model
|
||||
Net net = readNet(modelBin, modelTxt);
|
||||
// and the image
|
||||
Mat img = imread(imageFile);
|
||||
if (img.empty())
|
||||
{
|
||||
std::cerr << "Can't read image from the file: " << imageFile << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// send it through the network
|
||||
Mat inputBlob = blobFromImage(img, scale, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
|
||||
net.setInput(inputBlob);
|
||||
Mat result = net.forward();
|
||||
// the result is an array of "heatmaps", the probability of a body part being in location x,y
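// (the output blob is laid out as [1 x channels x H x W]; the first nparts channels
//  are the per-keypoint heatmaps used below, any remaining channels are ignored)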
|
||||
|
||||
int H = result.size[2];
|
||||
int W = result.size[3];
|
||||
|
||||
// find the position of the body parts
|
||||
vector<Point> points(22);
|
||||
for (int n=0; n<nparts; n++)
|
||||
{
|
||||
// Slice heatmap of corresponding body's part.
|
||||
Mat heatMap(H, W, CV_32F, result.ptr(0,n));
|
||||
// 1 maximum per heatmap
|
||||
Point p(-1,-1),pm;
|
||||
double conf;
|
||||
minMaxLoc(heatMap, 0, &conf, 0, &pm);
|
||||
if (conf > thresh)
|
||||
p = pm;
|
||||
points[n] = p;
|
||||
}
|
||||
|
||||
// connect body parts and draw it !
|
||||
float SX = float(img.cols) / W;
|
||||
float SY = float(img.rows) / H;
|
||||
for (int n=0; n<npairs; n++)
|
||||
{
|
||||
// lookup 2 connected body/hand parts
|
||||
Point2f a = points[POSE_PAIRS[midx][n][0]];
|
||||
Point2f b = points[POSE_PAIRS[midx][n][1]];
|
||||
|
||||
// we did not find enough confidence before
|
||||
if (a.x<=0 || a.y<=0 || b.x<=0 || b.y<=0)
|
||||
continue;
|
||||
|
||||
// scale to image size
|
||||
a.x*=SX; a.y*=SY;
|
||||
b.x*=SX; b.y*=SY;
|
||||
|
||||
line(img, a, b, Scalar(0,200,0), 2);
|
||||
circle(img, a, 3, Scalar(0,0,200), -1);
|
||||
circle(img, b, 3, Scalar(0,0,200), -1);
|
||||
}
|
||||
|
||||
imshow("OpenPose", img);
|
||||
waitKey();
|
||||
|
||||
return 0;
|
||||
}
|
||||
122
samples/dnn/openpose.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# To use Inference Engine backend, specify location of plugins:
|
||||
# source /opt/intel/computer_vision_sdk/bin/setupvars.sh
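#
# Example run (file names are illustrative; see the OpenPose repository for the models):
#   python openpose.py --proto pose_deploy_linevec.prototxt --model pose_iter_440000.caffemodel --dataset COCO --input image.jpg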
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This script is used to demonstrate OpenPose human pose estimation network '
|
||||
'from https://github.com/CMU-Perceptual-Computing-Lab/openpose project using OpenCV. '
|
||||
'The sample and model are simplified and can only handle a single person in the frame.')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--proto', help='Path to .prototxt')
|
||||
parser.add_argument('--model', help='Path to .caffemodel')
|
||||
parser.add_argument('--dataset', help='Specify what kind of model was trained. '
|
||||
'It could be COCO, MPI or HAND, depending on the dataset.')
|
||||
parser.add_argument('--thr', default=0.1, type=float, help='Threshold value for pose parts heat map')
|
||||
parser.add_argument('--width', default=368, type=int, help='Resize input to specific width.')
|
||||
parser.add_argument('--height', default=368, type=int, help='Resize input to specific height.')
|
||||
parser.add_argument('--scale', default=0.003922, type=float, help='Scale for blob.')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.dataset == 'COCO':
|
||||
BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
|
||||
"LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
|
||||
"RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "REye": 14,
|
||||
"LEye": 15, "REar": 16, "LEar": 17, "Background": 18 }
|
||||
|
||||
POSE_PAIRS = [ ["Neck", "RShoulder"], ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
|
||||
["RElbow", "RWrist"], ["LShoulder", "LElbow"], ["LElbow", "LWrist"],
|
||||
["Neck", "RHip"], ["RHip", "RKnee"], ["RKnee", "RAnkle"], ["Neck", "LHip"],
|
||||
["LHip", "LKnee"], ["LKnee", "LAnkle"], ["Neck", "Nose"], ["Nose", "REye"],
|
||||
["REye", "REar"], ["Nose", "LEye"], ["LEye", "LEar"] ]
|
||||
elif args.dataset == 'MPI':
|
||||
BODY_PARTS = { "Head": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
|
||||
"LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
|
||||
"RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "Chest": 14,
|
||||
"Background": 15 }
|
||||
|
||||
POSE_PAIRS = [ ["Head", "Neck"], ["Neck", "RShoulder"], ["RShoulder", "RElbow"],
|
||||
["RElbow", "RWrist"], ["Neck", "LShoulder"], ["LShoulder", "LElbow"],
|
||||
["LElbow", "LWrist"], ["Neck", "Chest"], ["Chest", "RHip"], ["RHip", "RKnee"],
|
||||
["RKnee", "RAnkle"], ["Chest", "LHip"], ["LHip", "LKnee"], ["LKnee", "LAnkle"] ]
|
||||
else:
|
||||
assert(args.dataset == 'HAND')
|
||||
BODY_PARTS = { "Wrist": 0,
|
||||
"ThumbMetacarpal": 1, "ThumbProximal": 2, "ThumbMiddle": 3, "ThumbDistal": 4,
|
||||
"IndexFingerMetacarpal": 5, "IndexFingerProximal": 6, "IndexFingerMiddle": 7, "IndexFingerDistal": 8,
|
||||
"MiddleFingerMetacarpal": 9, "MiddleFingerProximal": 10, "MiddleFingerMiddle": 11, "MiddleFingerDistal": 12,
|
||||
"RingFingerMetacarpal": 13, "RingFingerProximal": 14, "RingFingerMiddle": 15, "RingFingerDistal": 16,
|
||||
"LittleFingerMetacarpal": 17, "LittleFingerProximal": 18, "LittleFingerMiddle": 19, "LittleFingerDistal": 20,
|
||||
}
|
||||
|
||||
POSE_PAIRS = [ ["Wrist", "ThumbMetacarpal"], ["ThumbMetacarpal", "ThumbProximal"],
|
||||
["ThumbProximal", "ThumbMiddle"], ["ThumbMiddle", "ThumbDistal"],
|
||||
["Wrist", "IndexFingerMetacarpal"], ["IndexFingerMetacarpal", "IndexFingerProximal"],
|
||||
["IndexFingerProximal", "IndexFingerMiddle"], ["IndexFingerMiddle", "IndexFingerDistal"],
|
||||
["Wrist", "MiddleFingerMetacarpal"], ["MiddleFingerMetacarpal", "MiddleFingerProximal"],
|
||||
["MiddleFingerProximal", "MiddleFingerMiddle"], ["MiddleFingerMiddle", "MiddleFingerDistal"],
|
||||
["Wrist", "RingFingerMetacarpal"], ["RingFingerMetacarpal", "RingFingerProximal"],
|
||||
["RingFingerProximal", "RingFingerMiddle"], ["RingFingerMiddle", "RingFingerDistal"],
|
||||
["Wrist", "LittleFingerMetacarpal"], ["LittleFingerMetacarpal", "LittleFingerProximal"],
|
||||
["LittleFingerProximal", "LittleFingerMiddle"], ["LittleFingerMiddle", "LittleFingerDistal"] ]
|
||||
|
||||
|
||||
inWidth = args.width
|
||||
inHeight = args.height
|
||||
inScale = args.scale
|
||||
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.proto), cv.samples.findFile(args.model))
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameWidth = frame.shape[1]
|
||||
frameHeight = frame.shape[0]
|
||||
inp = cv.dnn.blobFromImage(frame, inScale, (inWidth, inHeight),
|
||||
(0, 0, 0), swapRB=False, crop=False)
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
assert(len(BODY_PARTS) <= out.shape[1])
|
||||
|
||||
points = []
|
||||
for i in range(len(BODY_PARTS)):
|
||||
# Slice heatmap of corresponding body's part.
|
||||
heatMap = out[0, i, :, :]
|
||||
|
||||
# Originally, we try to find all the local maxima. To simplify the sample,
|
||||
# we just find a global one. However, only a single pose at a time
|
||||
# can be detected this way.
|
||||
_, conf, _, point = cv.minMaxLoc(heatMap)
|
||||
x = (frameWidth * point[0]) / out.shape[3]
|
||||
y = (frameHeight * point[1]) / out.shape[2]
|
||||
|
||||
# Add a point if its confidence is higher than the threshold.
|
||||
points.append((int(x), int(y)) if conf > args.thr else None)
|
||||
|
||||
for pair in POSE_PAIRS:
|
||||
partFrom = pair[0]
|
||||
partTo = pair[1]
|
||||
assert(partFrom in BODY_PARTS)
|
||||
assert(partTo in BODY_PARTS)
|
||||
|
||||
idFrom = BODY_PARTS[partFrom]
|
||||
idTo = BODY_PARTS[partTo]
|
||||
|
||||
if points[idFrom] and points[idTo]:
|
||||
cv.line(frame, points[idFrom], points[idTo], (0, 255, 0), 3)
|
||||
cv.ellipse(frame, points[idFrom], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)
|
||||
cv.ellipse(frame, points[idTo], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)
|
||||
|
||||
t, _ = net.getPerfProfile()
|
||||
freq = cv.getTickFrequency() / 1000
|
||||
cv.putText(frame, '%.2fms' % (t / freq), (10, 20), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
cv.imshow('OpenPose using OpenCV', frame)
|
||||
103
samples/dnn/optical_flow.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
This sample uses the FlowNet v2 model to calculate optical flow.
|
||||
Original paper: https://arxiv.org/abs/1612.01925.
|
||||
Original repo: https://github.com/lmb-freiburg/flownet2.
|
||||
|
||||
Download the converted .caffemodel model from https://drive.google.com/open?id=16qvE9VNmU39NttpZwZs81Ga8VYQJDaWZ
|
||||
and .prototxt from https://drive.google.com/file/d/1RyNIUsan1ZOh2hpYIH36A-jofAvJlT6a/view?usp=sharing.
|
||||
Otherwise download original model from https://lmb.informatik.uni-freiburg.de/resources/binaries/flownet2/flownet2-models.tar.gz,
|
||||
convert the .h5 model to .caffemodel and modify the original .prototxt using the .prototxt from the link above.
|
||||
'''
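# Example run (file names are illustrative, matching the defaults above):
#   python optical_flow.py -i video.mp4 -p FlowNet2_deploy_anysize.prototxt -m FlowNet2_weights.caffemodel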
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
class OpticalFlow(object):
|
||||
def __init__(self, proto, model, height, width):
|
||||
self.net = cv.dnn.readNetFromCaffe(proto, model)
|
||||
self.net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
self.height = height
|
||||
self.width = width
|
||||
|
||||
def compute_flow(self, first_img, second_img):
|
||||
inp0 = cv.dnn.blobFromImage(first_img, size=(self.width, self.height))
|
||||
inp1 = cv.dnn.blobFromImage(second_img, size=(self.width, self.height))
|
||||
self.net.setInput(inp0, "img0")
|
||||
self.net.setInput(inp1, "img1")
|
||||
flow = self.net.forward()
|
||||
output = self.motion_to_color(flow)
|
||||
return output
|
||||
|
||||
def motion_to_color(self, flow):
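# Map the 2-channel flow field to a colour image: direction is encoded via an
# HSV colour wheel (sampled below) and magnitude controls how far the pixel
# moves away from white, similar to the common Middlebury-style visualization.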
|
||||
arr = np.arange(0, 255, dtype=np.uint8)
|
||||
colormap = cv.applyColorMap(arr, cv.COLORMAP_HSV)
|
||||
colormap = colormap.squeeze(1)
|
||||
|
||||
flow = flow.squeeze(0)
|
||||
fx, fy = flow[0, ...], flow[1, ...]
|
||||
rad = np.sqrt(fx**2 + fy**2)
|
||||
maxrad = rad.max() if rad.max() != 0 else 1
|
||||
|
||||
ncols = arr.size
|
||||
rad = rad[..., np.newaxis] / maxrad
|
||||
a = np.arctan2(-fy / maxrad, -fx / maxrad) / np.pi
|
||||
fk = (a + 1) / 2.0 * (ncols - 1)
|
||||
k0 = fk.astype(np.int32)  # np.int was removed in recent NumPy versions
|
||||
k1 = (k0 + 1) % ncols
|
||||
f = fk[..., np.newaxis] - k0[..., np.newaxis]
|
||||
|
||||
col0 = colormap[k0] / 255.0
|
||||
col1 = colormap[k1] / 255.0
|
||||
col = (1 - f) * col0 + f * col1
|
||||
col = np.where(rad <= 1, 1 - rad * (1 - col), col * 0.75)
|
||||
output = (255.0 * col).astype(np.uint8)
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to calculate optical flow using FlowNetv2',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('-input', '-i', required=True, help='Path to input video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--height', default=320, type=int, help='Input height')
|
||||
parser.add_argument('--width', default=448, type=int, help='Input width')
|
||||
parser.add_argument('--proto', '-p', default='FlowNet2_deploy_anysize.prototxt', help='Path to prototxt.')
|
||||
parser.add_argument('--model', '-m', default='FlowNet2_weights.caffemodel', help='Path to caffemodel.')
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model) or not os.path.isfile(args.proto):
|
||||
raise OSError("Prototxt or caffemodel not exist")
|
||||
|
||||
winName = 'Optical flow calculation in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
hasFrame, first_frame = cap.read()
|
||||
|
||||
divisor = 64.
|
||||
var = {}
|
||||
var['ADAPTED_WIDTH'] = int(np.ceil(args.width/divisor) * divisor)
|
||||
var['ADAPTED_HEIGHT'] = int(np.ceil(args.height/divisor) * divisor)
|
||||
var['SCALE_WIDTH'] = args.width / float(var['ADAPTED_WIDTH'])
|
||||
var['SCALE_HEIGHT'] = args.height / float(var['ADAPTED_HEIGHT'])
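# The "anysize" deploy prototxt contains placeholders such as $ADAPTED_WIDTH$;
# substitute them with the values computed above before passing the text to the network reader.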
|
||||
|
||||
config = ''
|
||||
proto = open(args.proto).readlines()
|
||||
for line in proto:
|
||||
for key, value in var.items():
|
||||
tag = "$%s$" % key
|
||||
line = line.replace(tag, str(value))
|
||||
config += line
|
||||
|
||||
caffemodel = open(args.model, 'rb').read()
|
||||
|
||||
opt_flow = OpticalFlow(bytearray(config.encode()), caffemodel, var['ADAPTED_HEIGHT'], var['ADAPTED_WIDTH'])
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, second_frame = cap.read()
|
||||
if not hasFrame:
|
||||
break
|
||||
flow = opt_flow.compute_flow(first_frame, second_frame)
|
||||
first_frame = second_frame
|
||||
cv.imshow(winName, flow)
|
||||
240
samples/dnn/person_reid.cpp
Normal file
@@ -0,0 +1,240 @@
|
||||
//
|
||||
// You can download a baseline ReID model and sample input from:
|
||||
// https://github.com/ReID-Team/ReID_extra_testdata
|
||||
//
|
||||
// Authors of samples and Youtu ReID baseline:
|
||||
// Xing Sun <winfredsun@tencent.com>
|
||||
// Feng Zheng <zhengf@sustech.edu.cn>
|
||||
// Xinyang Jiang <sevjiang@tencent.com>
|
||||
// Fufu Yu <fufuyu@tencent.com>
|
||||
// Enwei Zhang <miyozhang@tencent.com>
|
||||
//
|
||||
// Copyright (C) 2020-2021, Tencent.
|
||||
// Copyright (C) 2020-2021, SUSTech.
|
||||
//
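// Example invocation (binary and file names are illustrative; the output
// directory must already exist):
//   ./example_dnn_person_reid --model=youtu_reid_baseline_lite.onnx --query_list=query_list.txt --gallery_list=gallery_list.txt --output_dir=vis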
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char* keys =
|
||||
"{help h | | show help message}"
|
||||
"{model m | | network model}"
|
||||
"{query_list q | | list of query images}"
|
||||
"{gallery_list g | | list of gallery images}"
|
||||
"{batch_size | 32 | batch size of each inference}"
|
||||
"{resize_h | 256 | resize input to specific height.}"
|
||||
"{resize_w | 128 | resize input to specific width.}"
|
||||
"{topk k | 5 | number of gallery images showed in visualization}"
|
||||
"{output_dir | | path for visualization(it should be existed)}"
|
||||
"{backend b | 0 | choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation ,"
|
||||
"5: CUDA }"
|
||||
"{target t | 0 | choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"6: CUDA ,"
|
||||
"7: CUDA fp16 (half-float preprocess) }";
|
||||
|
||||
namespace cv{
|
||||
namespace reid{
|
||||
|
||||
static Mat preprocess(const Mat& img)
|
||||
{
|
||||
const double mean[3] = {0.485, 0.456, 0.406};
|
||||
const double std[3] = {0.229, 0.224, 0.225};
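// ImageNet channel statistics; imread() returns BGR, so channel c is matched
// with the RGB statistic at index 2 - c below.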
|
||||
Mat ret = Mat(img.rows, img.cols, CV_32FC3);
|
||||
for (int y = 0; y < ret.rows; y ++)
|
||||
{
|
||||
for (int x = 0; x < ret.cols; x++)
|
||||
{
|
||||
for (int c = 0; c < 3; c++)
|
||||
{
|
||||
ret.at<Vec3f>(y,x)[c] = (float)((img.at<Vec3b>(y,x)[c] / 255.0 - mean[2 - c]) / std[2 - c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::vector<float> normalization(const std::vector<float>& feature)
|
||||
{
|
||||
std::vector<float> ret;
|
||||
float sum = 0.0;
|
||||
for(int i = 0; i < (int)feature.size(); i++)
|
||||
{
|
||||
sum += feature[i] * feature[i];
|
||||
}
|
||||
sum = sqrt(sum);
|
||||
for(int i = 0; i < (int)feature.size(); i++)
|
||||
{
|
||||
ret.push_back(feature[i] / sum);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void extractFeatures(const std::vector<std::string>& imglist, Net* net, const int& batch_size, const int& resize_h, const int& resize_w, std::vector<std::vector<float>>& features)
|
||||
{
|
||||
for(int st = 0; st < (int)imglist.size(); st += batch_size)
|
||||
{
|
||||
std::vector<Mat> batch;
|
||||
for(int delta = 0; delta < batch_size && st + delta < (int)imglist.size(); delta++)
|
||||
{
|
||||
Mat img = imread(imglist[st + delta]);
|
||||
batch.push_back(preprocess(img));
|
||||
}
|
||||
Mat blob = dnn::blobFromImages(batch, 1.0, Size(resize_w, resize_h), Scalar(0.0,0.0,0.0), true, false, CV_32F);
|
||||
net->setInput(blob);
|
||||
Mat out = net->forward();
|
||||
for(int i = 0; i < (int)out.size().height; i++)
|
||||
{
|
||||
std::vector<float> temp_feature;
|
||||
for(int j = 0; j < (int)out.size().width; j++)
|
||||
{
|
||||
temp_feature.push_back(out.at<float>(i,j));
|
||||
}
|
||||
features.push_back(normalization(temp_feature));
|
||||
}
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static void getNames(const std::string& ImageList, std::vector<std::string>& result)
|
||||
{
|
||||
std::ifstream img_in(ImageList);
|
||||
std::string img_name;
|
||||
while(img_in >> img_name)
|
||||
{
|
||||
result.push_back(img_name);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static float similarity(const std::vector<float>& feature1, const std::vector<float>& feature2)
|
||||
{
|
||||
float result = 0.0;
|
||||
for(int i = 0; i < (int)feature1.size(); i++)
|
||||
{
|
||||
result += feature1[i] * feature2[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void getTopK(const std::vector<std::vector<float>>& queryFeatures, const std::vector<std::vector<float>>& galleryFeatures, const int& topk, std::vector<std::vector<int>>& result)
|
||||
{
|
||||
for(int i = 0; i < (int)queryFeatures.size(); i++)
|
||||
{
|
||||
std::vector<float> similarityList;
|
||||
std::vector<int> index;
|
||||
for(int j = 0; j < (int)galleryFeatures.size(); j++)
|
||||
{
|
||||
similarityList.push_back(similarity(queryFeatures[i], galleryFeatures[j]));
|
||||
index.push_back(j);
|
||||
}
|
||||
sort(index.begin(), index.end(), [&](int x,int y){return similarityList[x] > similarityList[y];});
|
||||
std::vector<int> topk_result;
|
||||
for(int j = 0; j < min(topk, (int)index.size()); j++)
|
||||
{
|
||||
topk_result.push_back(index[j]);
|
||||
}
|
||||
result.push_back(topk_result);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static void addBorder(const Mat& img, const Scalar& color, Mat& result)
|
||||
{
|
||||
const int bordersize = 5;
|
||||
copyMakeBorder(img, result, bordersize, bordersize, bordersize, bordersize, cv::BORDER_CONSTANT, color);
|
||||
return ;
|
||||
}
|
||||
|
||||
static void drawRankList(const std::string& queryName, const std::vector<std::string>& galleryImageNames, const std::vector<int>& topk_index, const int& resize_h, const int& resize_w, Mat& result)
|
||||
{
|
||||
const Size outputSize = Size(resize_w, resize_h);
|
||||
Mat q_img = imread(queryName), temp_img;
|
||||
resize(q_img, temp_img, outputSize);
|
||||
addBorder(temp_img, Scalar(0,0,0), q_img);
|
||||
putText(q_img, "Query", Point(10, 30), FONT_HERSHEY_COMPLEX, 1.0, Scalar(0,255,0), 2);
|
||||
std::vector<Mat> Images;
|
||||
Images.push_back(q_img);
|
||||
for(int i = 0; i < (int)topk_index.size(); i++)
|
||||
{
|
||||
Mat g_img = imread(galleryImageNames[topk_index[i]]);
|
||||
resize(g_img, temp_img, outputSize);
|
||||
addBorder(temp_img, Scalar(255,255,255), g_img);
|
||||
putText(g_img, "G" + std::to_string(i), Point(10, 30), FONT_HERSHEY_COMPLEX, 1.0, Scalar(0,255,0), 2);
|
||||
Images.push_back(g_img);
|
||||
}
|
||||
hconcat(Images, result);
|
||||
return ;
|
||||
}
|
||||
|
||||
static void visualization(const std::vector<std::vector<int>>& topk, const std::vector<std::string>& queryImageNames, const std::vector<std::string>& galleryImageNames, const std::string& output_dir, const int& resize_h, const int& resize_w)
|
||||
{
|
||||
for(int i = 0; i < (int)queryImageNames.size(); i++)
|
||||
{
|
||||
Mat img;
|
||||
drawRankList(queryImageNames[i], galleryImageNames, topk[i], resize_h, resize_w, img);
|
||||
std::string output_path = output_dir + "/" + queryImageNames[i].substr(queryImageNames[i].rfind("/")+1);
|
||||
imwrite(output_path, img);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run ReID networks using OpenCV.");
|
||||
|
||||
const std::string modelPath = parser.get<String>("model");
|
||||
const std::string queryImageList = parser.get<String>("query_list");
|
||||
const std::string galleryImageList = parser.get<String>("gallery_list");
|
||||
const int backend = parser.get<int>("backend");
|
||||
const int target = parser.get<int>("target");
|
||||
const int batch_size = parser.get<int>("batch_size");
|
||||
const int resize_h = parser.get<int>("resize_h");
|
||||
const int resize_w = parser.get<int>("resize_w");
|
||||
const int topk = parser.get<int>("topk");
|
||||
const std::string output_dir= parser.get<String>("output_dir");
|
||||
|
||||
std::vector<std::string> queryImageNames;
|
||||
reid::getNames(queryImageList, queryImageNames);
|
||||
std::vector<std::string> galleryImageNames;
|
||||
reid::getNames(galleryImageList, galleryImageNames);
|
||||
|
||||
dnn::Net net = dnn::readNet(modelPath);
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(target);
|
||||
|
||||
std::vector<std::vector<float>> queryFeatures;
|
||||
reid::extractFeatures(queryImageNames, &net, batch_size, resize_h, resize_w, queryFeatures);
|
||||
std::vector<std::vector<float>> galleryFeatures;
|
||||
reid::extractFeatures(galleryImageNames, &net, batch_size, resize_h, resize_w, galleryFeatures);
|
||||
|
||||
std::vector<std::vector<int>> topkResult;
|
||||
reid::getTopK(queryFeatures, galleryFeatures, topk, topkResult);
|
||||
reid::visualization(topkResult, queryImageNames, galleryImageNames, output_dir, resize_h, resize_w);
|
||||
|
||||
return 0;
|
||||
}
|
||||
236
samples/dnn/person_reid.py
Normal file
@@ -0,0 +1,236 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
You can download a baseline ReID model and sample input from:
|
||||
https://github.com/ReID-Team/ReID_extra_testdata
|
||||
|
||||
Authors of samples and Youtu ReID baseline:
|
||||
Xing Sun <winfredsun@tencent.com>
|
||||
Feng Zheng <zhengf@sustech.edu.cn>
|
||||
Xinyang Jiang <sevjiang@tencent.com>
|
||||
Fufu Yu <fufuyu@tencent.com>
|
||||
Enwei Zhang <miyozhang@tencent.com>
|
||||
|
||||
Copyright (C) 2020-2021, Tencent.
|
||||
Copyright (C) 2020-2021, SUSTech.
|
||||
'''
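# Example run (file names are illustrative; the directories contain cropped person images):
#   python person_reid.py --model reid.onnx --query_dir query --gallery_dir gallery --topk 5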
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT,
|
||||
cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
|
||||
cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_CUDA)
|
||||
|
||||
targets = (cv.dnn.DNN_TARGET_CPU,
|
||||
cv.dnn.DNN_TARGET_OPENCL,
|
||||
cv.dnn.DNN_TARGET_OPENCL_FP16,
|
||||
cv.dnn.DNN_TARGET_MYRIAD,
|
||||
cv.dnn.DNN_TARGET_HDDL,
|
||||
cv.dnn.DNN_TARGET_CUDA,
|
||||
cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
MEAN = (0.485, 0.456, 0.406)
|
||||
STD = (0.229, 0.224, 0.225)
|
||||
|
||||
def preprocess(images, height, width):
|
||||
"""
|
||||
Create 4-dimensional blob from image
|
||||
:param images: list of input images
|
||||
:param height: the height of the resized input image
|
||||
:param width: the width of the resized input image
|
||||
"""
|
||||
img_list = []
|
||||
for image in images:
|
||||
image = cv.resize(image, (width, height))
|
||||
img_list.append(image[:, :, ::-1])
|
||||
|
||||
images = np.array(img_list)
|
||||
images = (images / 255.0 - MEAN) / STD
|
||||
|
||||
input = cv.dnn.blobFromImages(images.astype(np.float32), ddepth = cv.CV_32F)
|
||||
return input
|
||||
|
||||
def extract_feature(img_dir, model_path, batch_size = 32, resize_h = 384, resize_w = 128, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Extract features from images in a target directory
|
||||
:param img_dir: the input image directory
|
||||
:param model_path: path to ReID model
|
||||
:param batch_size: the batch size for each network inference iteration
|
||||
:param resize_h: the height of the input image
|
||||
:param resize_w: the width of the input image
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
feat_list = []
|
||||
path_list = os.listdir(img_dir)
|
||||
path_list = [os.path.join(img_dir, img_name) for img_name in path_list]
|
||||
count = 0
|
||||
|
||||
for i in range(0, len(path_list), batch_size):
|
||||
print('Feature Extraction for images in', img_dir, 'Batch:', count, '/', len(path_list))
|
||||
batch = path_list[i : min(i + batch_size, len(path_list))]
|
||||
imgs = read_data(batch)
|
||||
inputs = preprocess(imgs, resize_h, resize_w)
|
||||
|
||||
feat = run_net(inputs, model_path, backend, target)
|
||||
|
||||
feat_list.append(feat)
|
||||
count += batch_size
|
||||
|
||||
feats = np.concatenate(feat_list, axis = 0)
|
||||
return feats, path_list
|
||||
|
||||
def run_net(inputs, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Forward propagation for a batch of images.
|
||||
:param inputs: input batch of images
|
||||
:param model_path: path to ReID model
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
net = cv.dnn.readNet(model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(inputs)
|
||||
out = net.forward()
|
||||
out = np.reshape(out, (out.shape[0], out.shape[1]))
|
||||
return out
|
||||
|
||||
def read_data(path_list):
|
||||
"""
|
||||
Read all images from a list of paths into a list
|
||||
:param path_list: the list of image path
|
||||
"""
|
||||
img_list = []
|
||||
for img_path in path_list:
|
||||
img = cv.imread(img_path)
|
||||
if img is None:
|
||||
continue
|
||||
img_list.append(img)
|
||||
return img_list
|
||||
|
||||
def normalize(nparray, order=2, axis=0):
|
||||
"""
|
||||
Normalize a N-D numpy array along the specified axis.
|
||||
:param nparray: the array of vectors to be normalized
|
||||
:param order: order of the norm
|
||||
:param axis: the axis of x along which to compute the vector norms
|
||||
"""
|
||||
norm = np.linalg.norm(nparray, ord=order, axis=axis, keepdims=True)
|
||||
return nparray / (norm + np.finfo(np.float32).eps)
|
||||
|
||||
def similarity(array1, array2):
|
||||
"""
|
||||
Compute the cosine similarity of all pairs.
|
||||
:param array1: numpy array with shape [m1, n]
|
||||
:param array2: numpy array with shape [m2, n]
|
||||
Returns:
|
||||
numpy array with shape [m1, m2]
|
||||
"""
|
||||
array1 = normalize(array1, axis=1)
|
||||
array2 = normalize(array2, axis=1)
|
||||
dist = np.matmul(array1, array2.T)
|
||||
return dist
|
||||
|
||||
def topk(query_feat, gallery_feat, topk = 5):
|
||||
"""
|
||||
Return the index of top K gallery images most similar to the query images
|
||||
:param query_feat: array of feature vectors of query images
|
||||
:param gallery_feat: array of feature vectors of gallery images
|
||||
:param topk: number of gallery images to return
|
||||
"""
|
||||
sim = similarity(query_feat, gallery_feat)
|
||||
index = np.argsort(-sim, axis = 1)
|
||||
return [i[0:int(topk)] for i in index]
|
||||
|
||||
def drawRankList(query_name, gallery_list, output_size = (128, 384)):
|
||||
"""
|
||||
Draw the rank list
|
||||
:param query_name: path of the query image
|
||||
:param gallery_list: list of paths of the gallery images
|
||||
"param output_size: the output size of each image in the rank list
|
||||
"""
|
||||
def addBorder(im, color):
|
||||
bordersize = 5
|
||||
border = cv.copyMakeBorder(
|
||||
im,
|
||||
top = bordersize,
|
||||
bottom = bordersize,
|
||||
left = bordersize,
|
||||
right = bordersize,
|
||||
borderType = cv.BORDER_CONSTANT,
|
||||
value = color
|
||||
)
|
||||
return border
|
||||
query_img = cv.imread(query_name)
|
||||
query_img = cv.resize(query_img, output_size)
|
||||
query_img = addBorder(query_img, [0, 0, 0])
|
||||
cv.putText(query_img, 'Query', (10, 30), cv.FONT_HERSHEY_COMPLEX, 1., (0,255,0), 2)
|
||||
|
||||
gallery_img_list = []
|
||||
for i, gallery_name in enumerate(gallery_list):
|
||||
gallery_img = cv.imread(gallery_name)
|
||||
gallery_img = cv.resize(gallery_img, output_size)
|
||||
gallery_img = addBorder(gallery_img, [255, 255, 255])
|
||||
cv.putText(gallery_img, 'G%02d'%i, (10, 30), cv.FONT_HERSHEY_COMPLEX, 1., (0,255,0), 2)
|
||||
gallery_img_list.append(gallery_img)
|
||||
ret = np.concatenate([query_img] + gallery_img_list, axis = 1)
|
||||
return ret
|
||||
|
||||
|
||||
def visualization(topk_idx, query_names, gallery_names, output_dir = 'vis'):
|
||||
"""
|
||||
Visualize the retrieval results with the person ReID model
|
||||
:param topk_idx: the index of ranked gallery images for each query image
|
||||
:param query_names: the list of paths of query images
|
||||
:param gallery_names: the list of paths of gallery images
|
||||
:param output_dir: the path to save the visualization results
|
||||
"""
|
||||
if not os.path.exists(output_dir):
|
||||
os.mkdir(output_dir)
|
||||
for i, idx in enumerate(topk_idx):
|
||||
query_name = query_names[i]
|
||||
topk_names = [gallery_names[j] for j in idx]
|
||||
vis_img = drawRankList(query_name, topk_names)
|
||||
output_path = os.path.join(output_dir, '%03d_%s'%(i, os.path.basename(query_name)))
|
||||
cv.imwrite(output_path, vis_img)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to run person re-identification networks using OpenCV',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--query_dir', '-q', required=True, help='Path to query image directory.')
|
||||
parser.add_argument('--gallery_dir', '-g', required=True, help='Path to gallery directory.')
|
||||
parser.add_argument('--resize_h', default = 256, help='The height of the input for model inference.')
|
||||
parser.add_argument('--resize_w', default = 128, help='The width of the input for model inference')
|
||||
parser.add_argument('--model', '-m', default='reid.onnx', help='Path to onnx model.')
|
||||
parser.add_argument('--visualization_dir', default='vis', help='Path for the visualization results')
|
||||
parser.add_argument('--topk', default=10, help='Number of images visualized in the rank list')
|
||||
parser.add_argument('--batchsize', default=32, help='The batch size of each inference')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation"
|
||||
"%d: CUDA backend"% backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: CUDA,'
|
||||
'%d: CUDA FP16,'
|
||||
% targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Model not exist")
|
||||
|
||||
query_feat, query_names = extract_feature(args.query_dir, args.model, args.batchsize, args.resize_h, args.resize_w, args.backend, args.target)
|
||||
gallery_feat, gallery_names = extract_feature(args.gallery_dir, args.model, args.batchsize, args.resize_h, args.resize_w, args.backend, args.target)
|
||||
|
||||
topk_idx = topk(query_feat, gallery_feat, args.topk)
|
||||
visualization(topk_idx, query_names, gallery_names, output_dir = args.visualization_dir)
|
||||
165
samples/dnn/scene_text_detection.cpp
Normal file
@@ -0,0 +1,165 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ modelPath mp | | Path to a binary .onnx file contains trained DB detector model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ inputHeight ih |736| image height of the model input. It should be multiple by 32.}"
|
||||
"{ inputWidth iw |736| image width of the model input. It should be multiple by 32.}"
|
||||
"{ binaryThreshold bt |0.3| Confidence threshold of the binary map. }"
|
||||
"{ polygonThreshold pt |0.5| Confidence threshold of polygons. }"
|
||||
"{ maxCandidate max |200| Max candidates of polygons. }"
|
||||
"{ unclipRatio ratio |2.0| unclip ratio. }"
|
||||
"{ evaluate e |false| false: predict with input images; true: evaluate on benchmarks. }"
|
||||
"{ evalDataPath edp | | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
static
|
||||
void split(const std::string& s, char delimiter, std::vector<std::string>& elems)
|
||||
{
|
||||
elems.clear();
|
||||
size_t prev_pos = 0;
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(delimiter, prev_pos)) != std::string::npos)
|
||||
{
|
||||
elems.emplace_back(s.substr(prev_pos, pos - prev_pos));
|
||||
prev_pos = pos + 1;
|
||||
}
|
||||
if (prev_pos < s.size())
|
||||
elems.emplace_back(s.substr(prev_pos, s.size() - prev_pos));
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run the official PyTorch implementation (https://github.com/MhLiao/DB) of "
|
||||
"Real-time Scene Text Detection with Differentiable Binarization (https://arxiv.org/abs/1911.08947)\n"
|
||||
"The current version of this script is a variant of the original network without deformable convolution");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float binThresh = parser.get<float>("binaryThreshold");
|
||||
float polyThresh = parser.get<float>("polygonThreshold");
|
||||
uint maxCandidates = parser.get<uint>("maxCandidate");
|
||||
String modelPath = parser.get<String>("modelPath");
|
||||
double unclipRatio = parser.get<double>("unclipRatio");
|
||||
int height = parser.get<int>("inputHeight");
|
||||
int width = parser.get<int>("inputWidth");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load the network
|
||||
CV_Assert(!modelPath.empty());
|
||||
TextDetectionModel_DB detector(modelPath);
|
||||
detector.setBinaryThreshold(binThresh)
|
||||
.setPolygonThreshold(polyThresh)
|
||||
.setUnclipRatio(unclipRatio)
|
||||
.setMaxCandidates(maxCandidates);
|
||||
|
||||
double scale = 1.0 / 255.0;
|
||||
Size inputSize = Size(width, height);
|
||||
Scalar mean = Scalar(122.67891434, 116.66876762, 104.00698793);
|
||||
detector.setInputParams(scale, inputSize, mean);
|
||||
|
||||
// Create a window
|
||||
static const std::string winName = "TextDetectionModel";
|
||||
|
||||
if (parser.get<bool>("evaluate")) {
|
||||
// for evaluation
|
||||
String evalDataPath = parser.get<String>("evalDataPath");
|
||||
CV_Assert(!evalDataPath.empty());
|
||||
String testListPath = evalDataPath + "/test_list.txt";
|
||||
std::ifstream testList;
|
||||
testList.open(testListPath);
|
||||
CV_Assert(testList.is_open());
|
||||
|
||||
// Create a window for showing groundtruth
|
||||
static const std::string winNameGT = "GT";
|
||||
|
||||
String testImgPath;
|
||||
while (std::getline(testList, testImgPath)) {
|
||||
String imgPath = evalDataPath + "/test_images/" + testImgPath;
|
||||
std::cout << "Image Path: " << imgPath << std::endl;
|
||||
|
||||
Mat frame = imread(samples::findFile(imgPath), IMREAD_COLOR);
|
||||
CV_Assert(!frame.empty());
|
||||
Mat src = frame.clone();
|
||||
|
||||
// Inference
|
||||
std::vector<std::vector<Point>> results;
|
||||
detector.detect(frame, results);
|
||||
|
||||
polylines(frame, results, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winName, frame);
|
||||
|
||||
// load groundtruth
|
||||
String imgName = testImgPath.substr(0, testImgPath.length() - 4);
|
||||
String gtPath = evalDataPath + "/test_gts/" + imgName + ".txt";
|
||||
// std::cout << gtPath << std::endl;
|
||||
std::ifstream gtFile;
|
||||
gtFile.open(gtPath);
|
||||
CV_Assert(gtFile.is_open());
|
||||
|
||||
std::vector<std::vector<Point>> gts;
|
||||
String gtLine;
|
||||
while (std::getline(gtFile, gtLine)) {
|
||||
size_t splitLoc = gtLine.find_last_of(',');
|
||||
String text = gtLine.substr(splitLoc+1);
|
||||
if ( text == "###\r" || text == "1") {
|
||||
// ignore difficult instances
|
||||
continue;
|
||||
}
|
||||
gtLine = gtLine.substr(0, splitLoc);
|
||||
|
||||
std::vector<std::string> v;
|
||||
split(gtLine, ',', v);
|
||||
|
||||
std::vector<int> loc;
|
||||
std::vector<Point> pts;
|
||||
for (auto && s : v) {
|
||||
loc.push_back(atoi(s.c_str()));
|
||||
}
|
||||
for (size_t i = 0; i < loc.size() / 2; i++) {
|
||||
pts.push_back(Point(loc[2 * i], loc[2 * i + 1]));
|
||||
}
|
||||
gts.push_back(pts);
|
||||
}
|
||||
polylines(src, gts, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winNameGT, src);
|
||||
|
||||
waitKey();
|
||||
}
|
||||
} else {
|
||||
// Open an image file
|
||||
CV_Assert(parser.has("inputImage"));
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
|
||||
CV_Assert(!frame.empty());
|
||||
|
||||
// Detect
|
||||
std::vector<std::vector<Point>> results;
|
||||
detector.detect(frame, results);
|
||||
|
||||
polylines(frame, results, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winName, frame);
|
||||
waitKey();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
144
samples/dnn/scene_text_recognition.cpp
Normal file
@@ -0,0 +1,144 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
String keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ modelPath mp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ evaluate e |false| false: predict with input images; true: evaluate on benchmarks. }"
|
||||
"{ evalDataPath edp | | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to recognition vocabulary. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
String convertForEval(String &input);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run the PyTorch implementation of "
|
||||
"An End-to-End Trainable Neural Network for Image-based SequenceRecognition and Its Application to Scene Text Recognition "
|
||||
"(https://arxiv.org/abs/1507.05717)");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
String modelPath = parser.get<String>("modelPath");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load the network
|
||||
CV_Assert(!modelPath.empty());
|
||||
TextRecognitionModel recognizer(modelPath);
|
||||
|
||||
// Load vocabulary
|
||||
CV_Assert(!vocPath.empty());
|
||||
std::ifstream vocFile;
|
||||
vocFile.open(samples::findFile(vocPath));
|
||||
CV_Assert(vocFile.is_open());
|
||||
String vocLine;
|
||||
std::vector<String> vocabulary;
|
||||
while (std::getline(vocFile, vocLine)) {
|
||||
vocabulary.push_back(vocLine);
|
||||
}
|
||||
recognizer.setVocabulary(vocabulary);
|
||||
recognizer.setDecodeType("CTC-greedy");
|
||||
|
||||
// Set parameters
|
||||
double scale = 1.0 / 127.5;
|
||||
Scalar mean = Scalar(127.5, 127.5, 127.5);
|
||||
Size inputSize = Size(100, 32);
|
||||
recognizer.setInputParams(scale, inputSize, mean);
|
||||
|
||||
if (parser.get<bool>("evaluate"))
|
||||
{
|
||||
// For evaluation
|
||||
String evalDataPath = parser.get<String>("evalDataPath");
|
||||
CV_Assert(!evalDataPath.empty());
|
||||
String gtPath = evalDataPath + "/test_gts.txt";
|
||||
std::ifstream evalGts;
|
||||
evalGts.open(gtPath);
|
||||
CV_Assert(evalGts.is_open());
|
||||
|
||||
String gtLine;
|
||||
int cntRight=0, cntAll=0;
|
||||
TickMeter timer;
|
||||
timer.reset();
|
||||
|
||||
while (std::getline(evalGts, gtLine)) {
|
||||
size_t splitLoc = gtLine.find_first_of(' ');
|
||||
String imgPath = evalDataPath + '/' + gtLine.substr(0, splitLoc);
|
||||
String gt = gtLine.substr(splitLoc+1);
|
||||
|
||||
// Inference
|
||||
Mat frame = imread(samples::findFile(imgPath), imreadRGB);
|
||||
CV_Assert(!frame.empty());
|
||||
timer.start();
|
||||
std::string recognitionResult = recognizer.recognize(frame);
|
||||
timer.stop();
|
||||
|
||||
if (gt == convertForEval(recognitionResult))
|
||||
cntRight++;
|
||||
|
||||
cntAll++;
|
||||
}
|
||||
std::cout << "Accuracy(%): " << (double)(cntRight) / (double)(cntAll) << std::endl;
|
||||
std::cout << "Average Inference Time(ms): " << timer.getTimeMilli() / (double)(cntAll) << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Create a window
|
||||
static const std::string winName = "Input Cropped Image";
|
||||
|
||||
// Open an image file
|
||||
CV_Assert(parser.has("inputImage"));
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")), imreadRGB);
|
||||
CV_Assert(!frame.empty());
|
||||
|
||||
// Recognition
|
||||
std::string recognitionResult = recognizer.recognize(frame);
|
||||
|
||||
imshow(winName, frame);
|
||||
std::cout << "Predition: '" << recognitionResult << "'" << std::endl;
|
||||
waitKey();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert the predictions to lower case, and remove other characters.
|
||||
// Only for Evaluation
|
||||
String convertForEval(String & input)
|
||||
{
|
||||
String output;
|
||||
for (uint i = 0; i < input.length(); i++){
|
||||
char ch = input[i];
|
||||
if ((int)ch >= 97 && (int)ch <= 122) {
|
||||
output.push_back(ch);
|
||||
} else if ((int)ch >= 65 && (int)ch <= 90) {
|
||||
output.push_back((char)(ch + 32));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
169
samples/dnn/scene_text_spotting.cpp
Normal file
@@ -0,0 +1,169 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ detModelPath dmp | | Path to a binary .onnx model for detection. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ recModelPath rmp | | Path to a binary .onnx model for recognition. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ inputHeight ih |736| image height of the model input. It should be multiple by 32.}"
|
||||
"{ inputWidth iw |736| image width of the model input. It should be multiple by 32.}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ binaryThreshold bt |0.3| Confidence threshold of the binary map. }"
|
||||
"{ polygonThreshold pt |0.5| Confidence threshold of polygons. }"
|
||||
"{ maxCandidate max |200| Max candidates of polygons. }"
|
||||
"{ unclipRatio ratio |2.0| unclip ratio. }"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
|
||||
bool sortPts(const Point& p1, const Point& p2);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run an end-to-end inference sample of textDetectionModel and textRecognitionModel APIs\n"
|
||||
"Use -h for more information");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float binThresh = parser.get<float>("binaryThreshold");
|
||||
float polyThresh = parser.get<float>("polygonThreshold");
|
||||
uint maxCandidates = parser.get<uint>("maxCandidate");
|
||||
String detModelPath = parser.get<String>("detModelPath");
|
||||
String recModelPath = parser.get<String>("recModelPath");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
double unclipRatio = parser.get<double>("unclipRatio");
|
||||
int height = parser.get<int>("inputHeight");
|
||||
int width = parser.get<int>("inputWidth");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load networks
|
||||
CV_Assert(!detModelPath.empty());
|
||||
TextDetectionModel_DB detector(detModelPath);
|
||||
detector.setBinaryThreshold(binThresh)
|
||||
.setPolygonThreshold(polyThresh)
|
||||
.setUnclipRatio(unclipRatio)
|
||||
.setMaxCandidates(maxCandidates);
|
||||
|
||||
CV_Assert(!recModelPath.empty());
|
||||
TextRecognitionModel recognizer(recModelPath);
|
||||
|
||||
// Load vocabulary
|
||||
CV_Assert(!vocPath.empty());
|
||||
std::ifstream vocFile;
|
||||
vocFile.open(samples::findFile(vocPath));
|
||||
CV_Assert(vocFile.is_open());
|
||||
String vocLine;
|
||||
std::vector<String> vocabulary;
|
||||
while (std::getline(vocFile, vocLine)) {
|
||||
vocabulary.push_back(vocLine);
|
||||
}
|
||||
recognizer.setVocabulary(vocabulary);
|
||||
recognizer.setDecodeType("CTC-greedy");
|
||||
|
||||
// Parameters for Detection
|
||||
double detScale = 1.0 / 255.0;
|
||||
Size detInputSize = Size(width, height);
|
||||
Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);
|
||||
detector.setInputParams(detScale, detInputSize, detMean);
|
||||
|
||||
// Parameters for Recognition
|
||||
double recScale = 1.0 / 127.5;
|
||||
Scalar recMean = Scalar(127.5);
|
||||
Size recInputSize = Size(100, 32);
|
||||
recognizer.setInputParams(recScale, recInputSize, recMean);
|
||||
|
||||
// Create a window
|
||||
static const std::string winName = "Text_Spotting";
|
||||
|
||||
// Input data
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
|
||||
std::cout << frame.size << std::endl;
|
||||
|
||||
// Inference
|
||||
std::vector< std::vector<Point> > detResults;
|
||||
detector.detect(frame, detResults);
|
||||
|
||||
if (detResults.size() > 0) {
|
||||
// Text Recognition
|
||||
Mat recInput;
|
||||
if (!imreadRGB) {
|
||||
cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
|
||||
} else {
|
||||
recInput = frame;
|
||||
}
|
||||
std::vector< std::vector<Point> > contours;
|
||||
for (uint i = 0; i < detResults.size(); i++)
|
||||
{
|
||||
const auto& quadrangle = detResults[i];
|
||||
CV_CheckEQ(quadrangle.size(), (size_t)4, "");
|
||||
|
||||
contours.emplace_back(quadrangle);
|
||||
|
||||
std::vector<Point2f> quadrangle_2f;
|
||||
for (int j = 0; j < 4; j++)
|
||||
quadrangle_2f.emplace_back(quadrangle[j]);
|
||||
|
||||
// Transform and Crop
|
||||
Mat cropped;
|
||||
fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
|
||||
|
||||
std::string recognitionResult = recognizer.recognize(cropped);
|
||||
std::cout << i << ": '" << recognitionResult << "'" << std::endl;
|
||||
|
||||
putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
|
||||
}
|
||||
polylines(frame, contours, true, Scalar(0, 255, 0), 2);
|
||||
} else {
|
||||
std::cout << "No Text Detected." << std::endl;
|
||||
}
|
||||
imshow(winName, frame);
|
||||
waitKey();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
|
||||
{
|
||||
const Size outputSize = Size(100, 32);
|
||||
|
||||
Point2f targetVertices[4] = {
|
||||
Point(0, outputSize.height - 1),
|
||||
Point(0, 0),
|
||||
Point(outputSize.width - 1, 0),
|
||||
Point(outputSize.width - 1, outputSize.height - 1)
|
||||
};
|
||||
Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
|
||||
|
||||
warpPerspective(frame, result, rotationMatrix, outputSize);
|
||||
|
||||
#if 0
|
||||
imshow("roi", result);
|
||||
waitKey();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool sortPts(const Point& p1, const Point& p2)
|
||||
{
|
||||
return p1.x < p2.x;
|
||||
}
|
||||
247
samples/dnn/segmentation.cpp
Normal file
@@ -0,0 +1,247 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
|
||||
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
|
||||
"{ device | 0 | camera device number. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
|
||||
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
|
||||
"{ classes | | Optional path to a text file with names of classes. }"
|
||||
"{ colors | | Optional path to a text file with colors for an every class. "
|
||||
"An every color is represented with three values from 0 to 255 in BGR channels order. }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation }"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU }";
|
||||
|
||||
using namespace cv;
|
||||
using namespace dnn;
|
||||
|
||||
std::vector<std::string> classes;
|
||||
std::vector<Vec3b> colors;
|
||||
|
||||
void showLegend();
|
||||
|
||||
void colorizeSegmentation(const Mat &score, Mat &segm);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
const std::string modelName = parser.get<String>("@alias");
|
||||
const std::string zooFile = parser.get<String>("zoo");
|
||||
|
||||
keys += genPreprocArguments(modelName, zooFile);
|
||||
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run semantic segmentation deep learning networks using OpenCV.");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float scale = parser.get<float>("scale");
|
||||
Scalar mean = parser.get<Scalar>("mean");
|
||||
bool swapRB = parser.get<bool>("rgb");
|
||||
int inpWidth = parser.get<int>("width");
|
||||
int inpHeight = parser.get<int>("height");
|
||||
String model = findFile(parser.get<String>("model"));
|
||||
String config = findFile(parser.get<String>("config"));
|
||||
String framework = parser.get<String>("framework");
|
||||
int backendId = parser.get<int>("backend");
|
||||
int targetId = parser.get<int>("target");
|
||||
|
||||
// Open file with classes names.
|
||||
if (parser.has("classes"))
|
||||
{
|
||||
std::string file = parser.get<String>("classes");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
classes.push_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
// Open file with colors.
|
||||
if (parser.has("colors"))
|
||||
{
|
||||
std::string file = parser.get<String>("colors");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
std::istringstream colorStr(line.c_str());
|
||||
|
||||
Vec3b color;
|
||||
for (int i = 0; i < 3 && !colorStr.eof(); ++i)
|
||||
colorStr >> color[i];
|
||||
colors.push_back(color);
|
||||
}
|
||||
}
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
CV_Assert(!model.empty());
|
||||
//! [Read and initialize network]
|
||||
Net net = readNet(model, config, framework);
|
||||
net.setPreferableBackend(backendId);
|
||||
net.setPreferableTarget(targetId);
|
||||
//! [Read and initialize network]
|
||||
|
||||
// Create a window
|
||||
static const std::string kWinName = "Deep learning semantic segmentation in OpenCV";
|
||||
namedWindow(kWinName, WINDOW_NORMAL);
|
||||
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
VideoCapture cap;
|
||||
if (parser.has("input"))
|
||||
cap.open(parser.get<String>("input"));
|
||||
else
|
||||
cap.open(parser.get<int>("device"));
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
|
||||
// Process frames.
|
||||
Mat frame, blob;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
//! [Create a 4D blob from a frame]
|
||||
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, false);
|
||||
//! [Create a 4D blob from a frame]
|
||||
|
||||
//! [Set input blob]
|
||||
net.setInput(blob);
|
||||
//! [Set input blob]
|
||||
//! [Make forward pass]
|
||||
Mat score = net.forward();
|
||||
//! [Make forward pass]
|
||||
|
||||
Mat segm;
|
||||
colorizeSegmentation(score, segm);
|
||||
|
||||
resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);
|
||||
addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);
|
||||
|
||||
// Put efficiency information.
|
||||
std::vector<double> layersTimes;
|
||||
double freq = getTickFrequency() / 1000;
|
||||
double t = net.getPerfProfile(layersTimes) / freq;
|
||||
std::string label = format("Inference time: %.2f ms", t);
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(kWinName, frame);
|
||||
if (!classes.empty())
|
||||
showLegend();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void colorizeSegmentation(const Mat &score, Mat &segm)
|
||||
{
|
||||
const int rows = score.size[2];
|
||||
const int cols = score.size[3];
|
||||
const int chns = score.size[1];
|
||||
|
||||
if (colors.empty())
|
||||
{
|
||||
// Generate colors.
|
||||
colors.push_back(Vec3b());
|
||||
for (int i = 1; i < chns; ++i)
|
||||
{
|
||||
Vec3b color;
|
||||
for (int j = 0; j < 3; ++j)
|
||||
color[j] = (colors[i - 1][j] + rand() % 256) / 2;
|
||||
colors.push_back(color);
|
||||
}
|
||||
}
|
||||
else if (chns != (int)colors.size())
|
||||
{
|
||||
CV_Error(Error::StsError, format("Number of output classes does not match "
|
||||
"number of colors (%d != %zu)", chns, colors.size()));
|
||||
}
|
||||
|
||||
Mat maxCl = Mat::zeros(rows, cols, CV_8UC1);
|
||||
Mat maxVal(rows, cols, CV_32FC1, score.data);
|
||||
for (int ch = 1; ch < chns; ch++)
|
||||
{
|
||||
for (int row = 0; row < rows; row++)
|
||||
{
|
||||
const float *ptrScore = score.ptr<float>(0, ch, row);
|
||||
uint8_t *ptrMaxCl = maxCl.ptr<uint8_t>(row);
|
||||
float *ptrMaxVal = maxVal.ptr<float>(row);
|
||||
for (int col = 0; col < cols; col++)
|
||||
{
|
||||
if (ptrScore[col] > ptrMaxVal[col])
|
||||
{
|
||||
ptrMaxVal[col] = ptrScore[col];
|
||||
ptrMaxCl[col] = (uchar)ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
segm.create(rows, cols, CV_8UC3);
|
||||
for (int row = 0; row < rows; row++)
|
||||
{
|
||||
const uchar *ptrMaxCl = maxCl.ptr<uchar>(row);
|
||||
Vec3b *ptrSegm = segm.ptr<Vec3b>(row);
|
||||
for (int col = 0; col < cols; col++)
|
||||
{
|
||||
ptrSegm[col] = colors[ptrMaxCl[col]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void showLegend()
|
||||
{
|
||||
static const int kBlockHeight = 30;
|
||||
static Mat legend;
|
||||
if (legend.empty())
|
||||
{
|
||||
const int numClasses = (int)classes.size();
|
||||
if ((int)colors.size() != numClasses)
|
||||
{
|
||||
CV_Error(Error::StsError, format("Number of output classes does not match "
|
||||
"number of labels (%zu != %zu)", colors.size(), classes.size()));
|
||||
}
|
||||
legend.create(kBlockHeight * numClasses, 200, CV_8UC3);
|
||||
for (int i = 0; i < numClasses; i++)
|
||||
{
|
||||
Mat block = legend.rowRange(i * kBlockHeight, (i + 1) * kBlockHeight);
|
||||
block.setTo(colors[i]);
|
||||
putText(block, classes[i], Point(0, kBlockHeight / 2), FONT_HERSHEY_SIMPLEX, 0.5, Vec3b(255, 255, 255));
|
||||
}
|
||||
namedWindow("Legend", WINDOW_NORMAL);
|
||||
imshow("Legend", legend);
|
||||
}
|
||||
}
|
||||
128
samples/dnn/segmentation.py
Normal file
128
samples/dnn/segmentation.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
from common import *
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
|
||||
|
||||
parser = argparse.ArgumentParser(add_help=False)
|
||||
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
|
||||
help='An optional path to file with preprocessing parameters.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
|
||||
help='Optional name of the origin framework of the model. '
|
||||
'If not set, it is detected automatically.')
|
||||
parser.add_argument('--colors', help='Optional path to a text file with colors for every class. '
|
||||
'Each color is represented by three values from 0 to 255 in BGR channel order.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
add_preproc_args(args.zoo, parser, 'segmentation')
|
||||
parser = argparse.ArgumentParser(parents=[parser],
|
||||
description='Use this script to run semantic segmentation deep learning networks using OpenCV.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
args = parser.parse_args()
|
||||
|
||||
args.model = findFile(args.model)
|
||||
args.config = findFile(args.config)
|
||||
args.classes = findFile(args.classes)
|
||||
|
||||
np.random.seed(324)
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load colors
|
||||
colors = None
|
||||
if args.colors:
|
||||
with open(args.colors, 'rt') as f:
|
||||
colors = [np.array(color.split(' '), np.uint8) for color in f.read().rstrip('\n').split('\n')]
|
||||
|
||||
legend = None
|
||||
def showLegend(classes):
|
||||
global legend
|
||||
if classes is not None and legend is None:
|
||||
blockHeight = 30
|
||||
assert(len(classes) == len(colors))
|
||||
|
||||
legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
|
||||
for i in range(len(classes)):
|
||||
block = legend[i * blockHeight:(i + 1) * blockHeight]
|
||||
block[:,:] = colors[i]
|
||||
cv.putText(block, classes[i], (0, blockHeight//2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
|
||||
|
||||
cv.namedWindow('Legend', cv.WINDOW_NORMAL)
|
||||
cv.imshow('Legend', legend)
|
||||
classes = None
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(args.model, args.config, args.framework)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
|
||||
winName = 'Deep learning semantic segmentation in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
legend = None
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
inpWidth = args.width if args.width else frameWidth
|
||||
inpHeight = args.height if args.height else frameHeight
|
||||
blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob)
|
||||
score = net.forward()
|
||||
|
||||
numClasses = score.shape[1]
|
||||
height = score.shape[2]
|
||||
width = score.shape[3]
|
||||
|
||||
# Draw segmentation
|
||||
if not colors:
|
||||
# Generate colors
|
||||
colors = [np.array([0, 0, 0], np.uint8)]
|
||||
for i in range(1, numClasses):
|
||||
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
|
||||
|
||||
classIds = np.argmax(score[0], axis=0)
|
||||
segm = np.stack([colors[idx] for idx in classIds.flatten()])
|
||||
segm = segm.reshape(height, width, 3)
|
||||
|
||||
segm = cv.resize(segm, (frameWidth, frameHeight), interpolation=cv.INTER_NEAREST)
|
||||
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)
|
||||
|
||||
# Put efficiency information.
|
||||
t, _ = net.getPerfProfile()
|
||||
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
showLegend(classes)
|
||||
|
||||
cv.imshow(winName, frame)
|
||||
62
samples/dnn/shrink_tf_graph_weights.py
Normal file
62
samples/dnn/shrink_tf_graph_weights.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# This file is part of OpenCV project.
|
||||
# It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
import tensorflow as tf
|
||||
import struct
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
parser = argparse.ArgumentParser(description='Convert weights of a frozen TensorFlow graph to fp16.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output graph.')
|
||||
parser.add_argument('--ops', default=['Conv2D', 'MatMul'], nargs='+',
|
||||
help='List of ops which weights are converted.')
|
||||
args = parser.parse_args()
|
||||
|
||||
DT_FLOAT = 1
|
||||
DT_HALF = 19
|
||||
|
||||
# In frozen graphs, every node that uses weights is connected to a Const node
|
||||
# through an Identity node, which is usually named after the weights with a '/read' suffix.
|
||||
# We'll replace all of these Identity nodes with Cast nodes.
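# A rough illustration of the rewrite (node names below are hypothetical):
#   before: conv/weights (Const, DT_FLOAT) -> conv/weights/read (Identity) -> Conv2D
#   after:  conv/weights (Const, DT_HALF)  -> conv/weights/read (Cast: half -> float) -> Conv2D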
|
||||
|
||||
# Load the model
|
||||
with tf.gfile.FastGFile(args.input, 'rb') as f:  # binary mode: ParseFromString expects bytes
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
# Set of all inputs from desired nodes.
|
||||
inputs = []
|
||||
for node in graph_def.node:
|
||||
if node.op in args.ops:
|
||||
inputs += node.input
|
||||
|
||||
weightsNodes = []
|
||||
for node in graph_def.node:
|
||||
# From the whole inputs we need to keep only an Identity nodes.
|
||||
if node.name in inputs and node.op == 'Identity' and node.attr['T'].type == DT_FLOAT:
|
||||
weightsNodes.append(node.input[0])
|
||||
|
||||
# Replace Identity to Cast.
|
||||
node.op = 'Cast'
|
||||
node.attr['DstT'].type = DT_FLOAT
|
||||
node.attr['SrcT'].type = DT_HALF
|
||||
del node.attr['T']
|
||||
del node.attr['_class']
|
||||
|
||||
# Convert weights to halfs.
|
||||
for node in graph_def.node:
|
||||
if node.name in weightsNodes:
|
||||
node.attr['dtype'].type = DT_HALF
|
||||
node.attr['value'].tensor.dtype = DT_HALF
|
||||
|
||||
floats = node.attr['value'].tensor.tensor_content
|
||||
|
||||
floats = struct.unpack('f' * (len(floats) // 4), floats)  # integer division: struct format counts must be int
|
||||
halfs = np.array(floats).astype(np.float16).view(np.uint16)
|
||||
node.attr['value'].tensor.tensor_content = struct.pack('H' * len(halfs), *halfs)
|
||||
|
||||
tf.train.write_graph(graph_def, "", args.output, as_text=False)
|
||||
397
samples/dnn/siamrpnpp.py
Normal file
397
samples/dnn/siamrpnpp.py
Normal file
@@ -0,0 +1,397 @@
|
||||
import argparse
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
"""
|
||||
Link to original paper : https://arxiv.org/abs/1812.11703
|
||||
Link to original repo : https://github.com/STVIR/pysot
|
||||
|
||||
You can download the pre-trained weights of the Tracker Model from https://drive.google.com/file/d/11bwgPFVkps9AH2NOD1zBDdpF_tQghAB-/view?usp=sharing
|
||||
You can download the target net (target branch of SiamRPN++) from https://drive.google.com/file/d/1dw_Ne3UMcCnFsaD6xkZepwE4GEpqq7U_/view?usp=sharing
|
||||
You can download the search net (search branch of SiamRPN++) from https://drive.google.com/file/d/1Lt4oE43ZSucJvze3Y-Z87CVDreO-Afwl/view?usp=sharing
|
||||
You can download the head model (RPN Head) from https://drive.google.com/file/d/1zT1yu12mtj3JQEkkfKFJWiZ71fJ-dQTi/view?usp=sharing
|
||||
"""
|
||||
|
||||
class ModelBuilder():
|
||||
""" This class generates the SiamRPN++ Tracker Model by using Imported ONNX Nets
|
||||
"""
|
||||
def __init__(self, target_net, search_net, rpn_head):
|
||||
super(ModelBuilder, self).__init__()
|
||||
# Build the target branch
|
||||
self.target_net = target_net
|
||||
# Build the search branch
|
||||
self.search_net = search_net
|
||||
# Build RPN_Head
|
||||
self.rpn_head = rpn_head
|
||||
|
||||
def template(self, z):
|
||||
""" Takes the template of size (1, 1, 127, 127) as an input to generate kernel
|
||||
"""
|
||||
self.target_net.setInput(z)
|
||||
outNames = self.target_net.getUnconnectedOutLayersNames()
|
||||
self.zfs_1, self.zfs_2, self.zfs_3 = self.target_net.forward(outNames)
|
||||
|
||||
def track(self, x):
|
||||
""" Takes the search of size (1, 1, 255, 255) as an input to generate classification score and bounding box regression
|
||||
"""
|
||||
self.search_net.setInput(x)
|
||||
outNames = self.search_net.getUnconnectedOutLayersNames()
|
||||
xfs_1, xfs_2, xfs_3 = self.search_net.forward(outNames)
|
||||
self.rpn_head.setInput(np.stack([self.zfs_1, self.zfs_2, self.zfs_3]), 'input_1')
|
||||
self.rpn_head.setInput(np.stack([xfs_1, xfs_2, xfs_3]), 'input_2')
|
||||
outNames = self.rpn_head.getUnconnectedOutLayersNames()
|
||||
cls, loc = self.rpn_head.forward(outNames)
|
||||
return {'cls': cls, 'loc': loc}
|
||||
|
||||
class Anchors:
|
||||
""" This class generate anchors.
|
||||
"""
|
||||
def __init__(self, stride, ratios, scales, image_center=0, size=0):
|
||||
self.stride = stride
|
||||
self.ratios = ratios
|
||||
self.scales = scales
|
||||
self.image_center = image_center
|
||||
self.size = size
|
||||
self.anchor_num = len(self.scales) * len(self.ratios)
|
||||
self.anchors = self.generate_anchors()
|
||||
|
||||
def generate_anchors(self):
|
||||
"""
|
||||
generate anchors based on predefined configuration
|
||||
"""
|
||||
anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)
|
||||
size = self.stride**2
|
||||
count = 0
|
||||
for r in self.ratios:
|
||||
ws = int(np.sqrt(size * 1. / r))
|
||||
hs = int(ws * r)
|
||||
|
||||
for s in self.scales:
|
||||
w = ws * s
|
||||
h = hs * s
|
||||
anchors[count][:] = [-w * 0.5, -h * 0.5, w * 0.5, h * 0.5][:]
|
||||
count += 1
|
||||
return anchors
|
||||
|
||||
class SiamRPNTracker:
|
||||
def __init__(self, model):
|
||||
super(SiamRPNTracker, self).__init__()
|
||||
self.anchor_stride = 8
|
||||
self.anchor_ratios = [0.33, 0.5, 1, 2, 3]
|
||||
self.anchor_scales = [8]
|
||||
self.track_base_size = 8
|
||||
self.track_context_amount = 0.5
|
||||
self.track_exemplar_size = 127
|
||||
self.track_instance_size = 255
|
||||
self.track_lr = 0.4
|
||||
self.track_penalty_k = 0.04
|
||||
self.track_window_influence = 0.44
|
||||
self.score_size = (self.track_instance_size - self.track_exemplar_size) // \
|
||||
self.anchor_stride + 1 + self.track_base_size
|
||||
self.anchor_num = len(self.anchor_ratios) * len(self.anchor_scales)
|
||||
hanning = np.hanning(self.score_size)
|
||||
window = np.outer(hanning, hanning)
|
||||
self.window = np.tile(window.flatten(), self.anchor_num)
|
||||
self.anchors = self.generate_anchor(self.score_size)
|
||||
self.model = model
|
||||
|
||||
def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
|
||||
"""
|
||||
Args:
|
||||
im: bgr based input image frame
|
||||
pos: position of the center of the frame
|
||||
model_sz: exemplar / target image size
|
||||
s_z: original / search image size
|
||||
avg_chans: channel average
|
||||
Return:
|
||||
im_patch: sub_windows for the given image input
|
||||
"""
|
||||
if isinstance(pos, float):
|
||||
pos = [pos, pos]
|
||||
sz = original_sz
|
||||
im_h, im_w, im_d = im.shape
|
||||
c = (original_sz + 1) / 2
|
||||
cx, cy = pos
|
||||
context_xmin = np.floor(cx - c + 0.5)
|
||||
context_xmax = context_xmin + sz - 1
|
||||
context_ymin = np.floor(cy - c + 0.5)
|
||||
context_ymax = context_ymin + sz - 1
|
||||
left_pad = int(max(0., -context_xmin))
|
||||
top_pad = int(max(0., -context_ymin))
|
||||
right_pad = int(max(0., context_xmax - im_w + 1))
|
||||
bottom_pad = int(max(0., context_ymax - im_h + 1))
|
||||
context_xmin += left_pad
|
||||
context_xmax += left_pad
|
||||
context_ymin += top_pad
|
||||
context_ymax += top_pad
|
||||
|
||||
if any([top_pad, bottom_pad, left_pad, right_pad]):
|
||||
size = (im_h + top_pad + bottom_pad, im_w + left_pad + right_pad, im_d)
|
||||
te_im = np.zeros(size, np.uint8)
|
||||
te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = im
|
||||
if top_pad:
|
||||
te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
|
||||
if bottom_pad:
|
||||
te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
|
||||
if left_pad:
|
||||
te_im[:, 0:left_pad, :] = avg_chans
|
||||
if right_pad:
|
||||
te_im[:, im_w + left_pad:, :] = avg_chans
|
||||
im_patch = te_im[int(context_ymin):int(context_ymax + 1),
|
||||
int(context_xmin):int(context_xmax + 1), :]
|
||||
else:
|
||||
im_patch = im[int(context_ymin):int(context_ymax + 1),
|
||||
int(context_xmin):int(context_xmax + 1), :]
|
||||
|
||||
if not np.array_equal(model_sz, original_sz):
|
||||
im_patch = cv.resize(im_patch, (model_sz, model_sz))
|
||||
im_patch = im_patch.transpose(2, 0, 1)
|
||||
im_patch = im_patch[np.newaxis, :, :, :]
|
||||
im_patch = im_patch.astype(np.float32)
|
||||
return im_patch
|
||||
|
||||
def generate_anchor(self, score_size):
|
||||
"""
|
||||
Args:
|
||||
score_size: size of the output score map; anchors are tiled over a score_size x score_size grid
|
||||
Return:
|
||||
anchor: anchors for pre-determined values of stride, ratio, and scale
|
||||
"""
|
||||
anchors = Anchors(self.anchor_stride, self.anchor_ratios, self.anchor_scales)
|
||||
anchor = anchors.anchors
|
||||
x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
|
||||
anchor = np.stack([(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)
|
||||
total_stride = anchors.stride
|
||||
anchor_num = anchors.anchor_num
|
||||
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
|
||||
ori = - (score_size // 2) * total_stride
|
||||
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
|
||||
[ori + total_stride * dy for dy in range(score_size)])
|
||||
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
|
||||
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
|
||||
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
|
||||
return anchor
|
||||
|
||||
def _convert_bbox(self, delta, anchor):
|
||||
"""
|
||||
Args:
|
||||
delta: localisation
|
||||
anchor: anchor of pre-determined anchor size
|
||||
Return:
|
||||
delta: prediction of bounding box
|
||||
"""
|
||||
delta_transpose = np.transpose(delta, (1, 2, 3, 0))
|
||||
delta_contig = np.ascontiguousarray(delta_transpose)
|
||||
delta = delta_contig.reshape(4, -1)
|
||||
delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
|
||||
delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
|
||||
delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
|
||||
delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
|
||||
return delta
|
||||
|
||||
def _softmax(self, x):
|
||||
"""
|
||||
Softmax in the direction of the depth of the layer
|
||||
"""
|
||||
x = x.astype(dtype=np.float32)
|
||||
x_max = x.max(axis=1)[:, np.newaxis]
|
||||
e_x = np.exp(x-x_max)
|
||||
div = np.sum(e_x, axis=1)[:, np.newaxis]
|
||||
y = e_x / div
|
||||
return y
|
||||
|
||||
def _convert_score(self, score):
|
||||
"""
|
||||
Args:
|
||||
cls: score
|
||||
Return:
|
||||
cls: score for cls
|
||||
"""
|
||||
score_transpose = np.transpose(score, (1, 2, 3, 0))
|
||||
score_con = np.ascontiguousarray(score_transpose)
|
||||
score_view = score_con.reshape(2, -1)
|
||||
score = np.transpose(score_view, (1, 0))
|
||||
score = self._softmax(score)
|
||||
return score[:,1]
|
||||
|
||||
def _bbox_clip(self, cx, cy, width, height, boundary):
|
||||
"""
|
||||
Adjusting the bounding box
|
||||
"""
|
||||
bbox_h, bbox_w = boundary
|
||||
cx = max(0, min(cx, bbox_w))
|
||||
cy = max(0, min(cy, bbox_h))
|
||||
width = max(10, min(width, bbox_w))
|
||||
height = max(10, min(height, bbox_h))
|
||||
return cx, cy, width, height
|
||||
|
||||
def init(self, img, bbox):
|
||||
"""
|
||||
Args:
|
||||
img(np.ndarray): bgr based input image frame
|
||||
bbox: (x, y, w, h): bounding box
|
||||
"""
|
||||
x, y, w, h = bbox
|
||||
self.center_pos = np.array([x + (w - 1) / 2, y + (h - 1) / 2])
|
||||
self.h = h
|
||||
self.w = w
|
||||
w_z = self.w + self.track_context_amount * np.add(h, w)
|
||||
h_z = self.h + self.track_context_amount * np.add(h, w)
|
||||
s_z = round(np.sqrt(w_z * h_z))
|
||||
self.channel_average = np.mean(img, axis=(0, 1))
|
||||
z_crop = self.get_subwindow(img, self.center_pos, self.track_exemplar_size, s_z, self.channel_average)
|
||||
self.model.template(z_crop)
|
||||
|
||||
def track(self, img):
|
||||
"""
|
||||
Args:
|
||||
img(np.ndarray): BGR image
|
||||
Return:
|
||||
bbox(list):[x, y, width, height]
|
||||
"""
|
||||
w_z = self.w + self.track_context_amount * np.add(self.w, self.h)
|
||||
h_z = self.h + self.track_context_amount * np.add(self.w, self.h)
|
||||
s_z = np.sqrt(w_z * h_z)
|
||||
scale_z = self.track_exemplar_size / s_z
|
||||
s_x = s_z * (self.track_instance_size / self.track_exemplar_size)
|
||||
x_crop = self.get_subwindow(img, self.center_pos, self.track_instance_size, round(s_x), self.channel_average)
|
||||
outputs = self.model.track(x_crop)
|
||||
score = self._convert_score(outputs['cls'])
|
||||
pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)
|
||||
|
||||
def change(r):
|
||||
return np.maximum(r, 1. / r)
|
||||
|
||||
def sz(w, h):
|
||||
pad = (w + h) * 0.5
|
||||
return np.sqrt((w + pad) * (h + pad))
|
||||
|
||||
# scale penalty
|
||||
s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
|
||||
(sz(self.w * scale_z, self.h * scale_z)))
|
||||
|
||||
# aspect ratio penalty
|
||||
r_c = change((self.w / self.h) /
|
||||
(pred_bbox[2, :] / pred_bbox[3, :]))
|
||||
penalty = np.exp(-(r_c * s_c - 1) * self.track_penalty_k)
|
||||
pscore = penalty * score
|
||||
|
||||
# window penalty
|
||||
pscore = pscore * (1 - self.track_window_influence) + \
|
||||
self.window * self.track_window_influence
|
||||
best_idx = np.argmax(pscore)
|
||||
bbox = pred_bbox[:, best_idx] / scale_z
|
||||
lr = penalty[best_idx] * score[best_idx] * self.track_lr
|
||||
|
||||
cpx, cpy = self.center_pos
|
||||
x,y,w,h = bbox
|
||||
cx = x + cpx
|
||||
cy = y + cpy
|
||||
|
||||
# smooth bbox
|
||||
width = self.w * (1 - lr) + w * lr
|
||||
height = self.h * (1 - lr) + h * lr
|
||||
|
||||
# clip boundary
|
||||
cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])
|
||||
|
||||
# update state
|
||||
self.center_pos = np.array([cx, cy])
|
||||
self.w = width
|
||||
self.h = height
|
||||
bbox = [cx - width / 2, cy - height / 2, width, height]
|
||||
best_score = score[best_idx]
|
||||
return {'bbox': bbox, 'best_score': best_score}
|
||||
|
||||
def get_frames(video_name):
|
||||
"""
|
||||
Args:
|
||||
Path to input video frame
|
||||
Return:
|
||||
Frame
|
||||
"""
|
||||
cap = cv.VideoCapture(video_name if video_name else 0)
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if ret:
|
||||
yield frame
|
||||
else:
|
||||
break
|
||||
|
||||
def main():
|
||||
""" Sample SiamRPN Tracker
|
||||
"""
|
||||
# Computation backends supported by layers
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
# Target Devices for computation
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)
|
||||
|
||||
parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_video', type=str, help='Path to input video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--target_net', type=str, default='target_net.onnx', help='Path to part of SiamRPN++ ran on target frame.')
|
||||
parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to part of SiamRPN++ ran on search frame.')
|
||||
parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help='Select a computation backend: '
|
||||
"%d: automatically (by default) "
|
||||
"%d: Halide"
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit)"
|
||||
"%d: OpenCV Implementation" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Select a target device: '
|
||||
"%d: CPU target (by default)"
|
||||
"%d: OpenCL"
|
||||
"%d: OpenCL FP16"
|
||||
"%d: Myriad" % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.input_video and not os.path.isfile(args.input_video):
|
||||
raise OSError("Input video file does not exist")
|
||||
if not os.path.isfile(args.target_net):
|
||||
raise OSError("Target Net does not exist")
|
||||
if not os.path.isfile(args.search_net):
|
||||
raise OSError("Search Net does not exist")
|
||||
if not os.path.isfile(args.rpn_head):
|
||||
raise OSError("RPN Head Net does not exist")
|
||||
|
||||
#Load the Networks
|
||||
target_net = cv.dnn.readNetFromONNX(args.target_net)
|
||||
target_net.setPreferableBackend(args.backend)
|
||||
target_net.setPreferableTarget(args.target)
|
||||
search_net = cv.dnn.readNetFromONNX(args.search_net)
|
||||
search_net.setPreferableBackend(args.backend)
|
||||
search_net.setPreferableTarget(args.target)
|
||||
rpn_head = cv.dnn.readNetFromONNX(args.rpn_head)
|
||||
rpn_head.setPreferableBackend(args.backend)
|
||||
rpn_head.setPreferableTarget(args.target)
|
||||
model = ModelBuilder(target_net, search_net, rpn_head)
|
||||
tracker = SiamRPNTracker(model)
|
||||
|
||||
first_frame = True
|
||||
cv.namedWindow('SiamRPN++ Tracker', cv.WINDOW_AUTOSIZE)
|
||||
for frame in get_frames(args.input_video):
|
||||
if first_frame:
|
||||
try:
|
||||
init_rect = cv.selectROI('SiamRPN++ Tracker', frame, False, False)
|
||||
except:
|
||||
exit()
|
||||
tracker.init(frame, init_rect)
|
||||
first_frame = False
|
||||
else:
|
||||
outputs = tracker.track(frame)
|
||||
bbox = list(map(int, outputs['bbox']))
|
||||
x,y,w,h = bbox
|
||||
cv.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 3)
|
||||
cv.imshow('SiamRPN++ Tracker', frame)
|
||||
key = cv.waitKey(1)
|
||||
if key == ord("q"):
|
||||
break
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
177
samples/dnn/text_detection.cpp
Normal file
177
samples/dnn/text_detection.cpp
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
Text detection model: https://github.com/argman/EAST
|
||||
Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
|
||||
|
||||
Text recognition models can be downloaded directly here:
|
||||
Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
|
||||
and doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
|
||||
|
||||
How to convert from pb to onnx:
|
||||
Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
|
||||
import torch
|
||||
from models.crnn import CRNN
|
||||
model = CRNN(32, 1, 37, 256)
|
||||
model.load_state_dict(torch.load('crnn.pth'))
|
||||
dummy_input = torch.randn(1, 1, 32, 100)
|
||||
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
|
||||
|
||||
For more information, please refer to doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown and doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
|
||||
*/
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char* keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
|
||||
"{ detModel dmp | | Path to a binary .pb file contains trained detector network.}"
|
||||
"{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
|
||||
"{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
|
||||
"{ thr | 0.5 | Confidence threshold. }"
|
||||
"{ nms | 0.4 | Non-maximum suppression threshold. }"
|
||||
"{ recModel rmp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
|
||||
"EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float confThreshold = parser.get<float>("thr");
|
||||
float nmsThreshold = parser.get<float>("nms");
|
||||
int width = parser.get<int>("width");
|
||||
int height = parser.get<int>("height");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
String detModelPath = parser.get<String>("detModel");
|
||||
String recModelPath = parser.get<String>("recModel");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load networks.
|
||||
CV_Assert(!detModelPath.empty() && !recModelPath.empty());
|
||||
TextDetectionModel_EAST detector(detModelPath);
|
||||
detector.setConfidenceThreshold(confThreshold)
|
||||
.setNMSThreshold(nmsThreshold);
|
||||
|
||||
TextRecognitionModel recognizer(recModelPath);
|
||||
|
||||
// Load vocabulary
|
||||
CV_Assert(!vocPath.empty());
|
||||
std::ifstream vocFile;
|
||||
vocFile.open(samples::findFile(vocPath));
|
||||
CV_Assert(vocFile.is_open());
|
||||
String vocLine;
|
||||
std::vector<String> vocabulary;
|
||||
while (std::getline(vocFile, vocLine)) {
|
||||
vocabulary.push_back(vocLine);
|
||||
}
|
||||
recognizer.setVocabulary(vocabulary);
|
||||
recognizer.setDecodeType("CTC-greedy");
|
||||
|
||||
// Parameters for Recognition
|
||||
double recScale = 1.0 / 127.5;
|
||||
Scalar recMean = Scalar(127.5, 127.5, 127.5);
|
||||
Size recInputSize = Size(100, 32);
|
||||
recognizer.setInputParams(recScale, recInputSize, recMean);
|
||||
|
||||
// Parameters for Detection
|
||||
double detScale = 1.0;
|
||||
Size detInputSize = Size(width, height);
|
||||
Scalar detMean = Scalar(123.68, 116.78, 103.94);
|
||||
bool swapRB = true;
|
||||
detector.setInputParams(detScale, detInputSize, detMean, swapRB);
|
||||
|
||||
// Open a video file or an image file or a camera stream.
|
||||
VideoCapture cap;
|
||||
bool openSuccess = parser.has("input") ? cap.open(parser.get<String>("input")) : cap.open(0);
|
||||
CV_Assert(openSuccess);
|
||||
|
||||
static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
|
||||
|
||||
Mat frame;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
std::cout << frame.size << std::endl;
|
||||
|
||||
// Detection
|
||||
std::vector< std::vector<Point> > detResults;
|
||||
detector.detect(frame, detResults);
|
||||
|
||||
if (detResults.size() > 0) {
|
||||
// Text Recognition
|
||||
Mat recInput;
|
||||
if (!imreadRGB) {
|
||||
cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
|
||||
} else {
|
||||
recInput = frame;
|
||||
}
|
||||
std::vector< std::vector<Point> > contours;
|
||||
for (uint i = 0; i < detResults.size(); i++)
|
||||
{
|
||||
const auto& quadrangle = detResults[i];
|
||||
CV_CheckEQ(quadrangle.size(), (size_t)4, "");
|
||||
|
||||
contours.emplace_back(quadrangle);
|
||||
|
||||
std::vector<Point2f> quadrangle_2f;
|
||||
for (int j = 0; j < 4; j++)
|
||||
quadrangle_2f.emplace_back(quadrangle[j]);
|
||||
|
||||
Mat cropped;
|
||||
fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
|
||||
|
||||
std::string recognitionResult = recognizer.recognize(cropped);
|
||||
std::cout << i << ": '" << recognitionResult << "'" << std::endl;
|
||||
|
||||
putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255), 2);
|
||||
}
|
||||
polylines(frame, contours, true, Scalar(0, 255, 0), 2);
|
||||
}
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
|
||||
{
|
||||
const Size outputSize = Size(100, 32);
|
||||
|
||||
Point2f targetVertices[4] = {
|
||||
Point(0, outputSize.height - 1),
|
||||
Point(0, 0), Point(outputSize.width - 1, 0),
|
||||
Point(outputSize.width - 1, outputSize.height - 1)
|
||||
};
|
||||
Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
|
||||
|
||||
warpPerspective(frame, result, rotationMatrix, outputSize);
|
||||
}
|
||||
239
samples/dnn/text_detection.py
Normal file
239
samples/dnn/text_detection.py
Normal file
@@ -0,0 +1,239 @@
|
||||
'''
|
||||
Text detection model: https://github.com/argman/EAST
|
||||
Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
|
||||
|
||||
CRNN Text recognition model taken from here: https://github.com/meijieru/crnn.pytorch
|
||||
How to convert from pb to onnx:
|
||||
Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
|
||||
|
||||
More converted onnx text recognition models can be downloaded directly here:
|
||||
Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
|
||||
And these models are taken from here: https://github.com/clovaai/deep-text-recognition-benchmark
|
||||
|
||||
import torch
|
||||
from models.crnn import CRNN
|
||||
|
||||
model = CRNN(32, 1, 37, 256)
|
||||
model.load_state_dict(torch.load('crnn.pth'))
|
||||
dummy_input = torch.randn(1, 1, 32, 100)
|
||||
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
|
||||
'''
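# A typical invocation (file paths are illustrative):
#   python text_detection.py --model frozen_east_text_detection.pb --ocr crnn.onnx --input scene.jpg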
|
||||
|
||||
|
||||
# Import required modules
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
import math
|
||||
import argparse
|
||||
|
||||
############ Add argument parser for command line arguments ############
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
|
||||
"EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)"
|
||||
"The OCR model can be obtained from converting the pretrained CRNN model to .onnx format from the github repository https://github.com/meijieru/crnn.pytorch"
|
||||
"Or you can download trained OCR model directly from https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing")
|
||||
parser.add_argument('--input',
|
||||
help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--model', '-m', required=True,
|
||||
help='Path to a binary .pb file contains trained detector network.')
|
||||
parser.add_argument('--ocr', default="crnn.onnx",
|
||||
help="Path to a binary .pb or .onnx file contains trained recognition network", )
|
||||
parser.add_argument('--width', type=int, default=320,
|
||||
help='Preprocess input image by resizing to a specific width. It should be a multiple of 32.')
|
||||
parser.add_argument('--height', type=int, default=320,
|
||||
help='Preprocess input image by resizing to a specific height. It should be a multiple of 32.')
|
||||
parser.add_argument('--thr', type=float, default=0.5,
|
||||
help='Confidence threshold.')
|
||||
parser.add_argument('--nms', type=float, default=0.4,
|
||||
help='Non-maximum suppression threshold.')
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
############ Utility functions ############
|
||||
|
||||
def fourPointsTransform(frame, vertices):
|
||||
vertices = np.asarray(vertices)
|
||||
outputSize = (100, 32)
|
||||
targetVertices = np.array([
|
||||
[0, outputSize[1] - 1],
|
||||
[0, 0],
|
||||
[outputSize[0] - 1, 0],
|
||||
[outputSize[0] - 1, outputSize[1] - 1]], dtype="float32")
|
||||
|
||||
rotationMatrix = cv.getPerspectiveTransform(vertices, targetVertices)
|
||||
result = cv.warpPerspective(frame, rotationMatrix, outputSize)
|
||||
return result
|
||||
|
||||
|
||||
def decodeText(scores):
|
||||
text = ""
|
||||
alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
|
||||
for i in range(scores.shape[0]):
|
||||
c = np.argmax(scores[i][0])
|
||||
if c != 0:
|
||||
text += alphabet[c - 1]
|
||||
else:
|
||||
text += '-'
|
||||
|
||||
# adjacent same letters as well as background text must be removed to get the final output
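# e.g. a raw per-timestep decode of "--hh-e-l-ll-oo-" collapses to "hello" (illustrative example)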
|
||||
char_list = []
|
||||
for i in range(len(text)):
|
||||
if text[i] != '-' and (not (i > 0 and text[i] == text[i - 1])):
|
||||
char_list.append(text[i])
|
||||
return ''.join(char_list)
|
||||
|
||||
|
||||
def decodeBoundingBoxes(scores, geometry, scoreThresh):
|
||||
detections = []
|
||||
confidences = []
|
||||
|
||||
############ CHECK DIMENSIONS AND SHAPES OF geometry AND scores ############
|
||||
assert len(scores.shape) == 4, "Incorrect dimensions of scores"
|
||||
assert len(geometry.shape) == 4, "Incorrect dimensions of geometry"
|
||||
assert scores.shape[0] == 1, "Invalid dimensions of scores"
|
||||
assert geometry.shape[0] == 1, "Invalid dimensions of geometry"
|
||||
assert scores.shape[1] == 1, "Invalid dimensions of scores"
|
||||
assert geometry.shape[1] == 5, "Invalid dimensions of geometry"
|
||||
assert scores.shape[2] == geometry.shape[2], "Invalid dimensions of scores and geometry"
|
||||
assert scores.shape[3] == geometry.shape[3], "Invalid dimensions of scores and geometry"
|
||||
height = scores.shape[2]
|
||||
width = scores.shape[3]
|
||||
for y in range(0, height):
|
||||
|
||||
# Extract data from scores
|
||||
scoresData = scores[0][0][y]
|
||||
x0_data = geometry[0][0][y]
|
||||
x1_data = geometry[0][1][y]
|
||||
x2_data = geometry[0][2][y]
|
||||
x3_data = geometry[0][3][y]
|
||||
anglesData = geometry[0][4][y]
|
||||
for x in range(0, width):
|
||||
score = scoresData[x]
|
||||
|
||||
# If score is lower than threshold score, move to next x
|
||||
if (score < scoreThresh):
|
||||
continue
|
||||
|
||||
# Calculate offset
|
||||
offsetX = x * 4.0
|
||||
offsetY = y * 4.0
|
||||
angle = anglesData[x]
|
||||
|
||||
# Calculate cos and sin of angle
|
||||
cosA = math.cos(angle)
|
||||
sinA = math.sin(angle)
|
||||
h = x0_data[x] + x2_data[x]
|
||||
w = x1_data[x] + x3_data[x]
|
||||
|
||||
# Calculate offset
|
||||
offset = ([offsetX + cosA * x1_data[x] + sinA * x2_data[x], offsetY - sinA * x1_data[x] + cosA * x2_data[x]])
|
||||
|
||||
# Find points for rectangle
|
||||
p1 = (-sinA * h + offset[0], -cosA * h + offset[1])
|
||||
p3 = (-cosA * w + offset[0], sinA * w + offset[1])
|
||||
center = (0.5 * (p1[0] + p3[0]), 0.5 * (p1[1] + p3[1]))
|
||||
detections.append((center, (w, h), -1 * angle * 180.0 / math.pi))
|
||||
confidences.append(float(score))
|
||||
|
||||
# Return detections and confidences
|
||||
return [detections, confidences]
|
||||
|
||||
|
||||
def main():
|
||||
# Read and store arguments
|
||||
confThreshold = args.thr
|
||||
nmsThreshold = args.nms
|
||||
inpWidth = args.width
|
||||
inpHeight = args.height
|
||||
modelDetector = args.model
|
||||
modelRecognition = args.ocr
|
||||
|
||||
# Load network
|
||||
detector = cv.dnn.readNet(modelDetector)
|
||||
recognizer = cv.dnn.readNet(modelRecognition)
|
||||
|
||||
# Create a new named window
|
||||
kWinName = "EAST: An Efficient and Accurate Scene Text Detector"
|
||||
cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
|
||||
outNames = []
|
||||
outNames.append("feature_fusion/Conv_7/Sigmoid")
|
||||
outNames.append("feature_fusion/concat_3")
|
||||
|
||||
# Open a video file or an image file or a camera stream
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
|
||||
tickmeter = cv.TickMeter()
|
||||
while cv.waitKey(1) < 0:
|
||||
# Read frame
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
# Get frame height and width
|
||||
height_ = frame.shape[0]
|
||||
width_ = frame.shape[1]
|
||||
rW = width_ / float(inpWidth)
|
||||
rH = height_ / float(inpHeight)
|
||||
|
||||
# Create a 4D blob from frame.
|
||||
blob = cv.dnn.blobFromImage(frame, 1.0, (inpWidth, inpHeight), (123.68, 116.78, 103.94), True, False)
|
||||
|
||||
# Run the detection model
|
||||
detector.setInput(blob)
|
||||
|
||||
tickmeter.start()
|
||||
outs = detector.forward(outNames)
|
||||
tickmeter.stop()
|
||||
|
||||
# Get scores and geometry
|
||||
scores = outs[0]
|
||||
geometry = outs[1]
|
||||
[boxes, confidences] = decodeBoundingBoxes(scores, geometry, confThreshold)
|
||||
|
||||
# Apply NMS
|
||||
indices = cv.dnn.NMSBoxesRotated(boxes, confidences, confThreshold, nmsThreshold)
|
||||
for i in indices:
|
||||
# get 4 corners of the rotated rect
|
||||
vertices = cv.boxPoints(boxes[i[0]])
|
||||
# scale the bounding box coordinates based on the respective ratios
|
||||
for j in range(4):
|
||||
vertices[j][0] *= rW
|
||||
vertices[j][1] *= rH
|
||||
|
||||
|
||||
# get cropped image using perspective transform
|
||||
if modelRecognition:
|
||||
cropped = fourPointsTransform(frame, vertices)
|
||||
cropped = cv.cvtColor(cropped, cv.COLOR_BGR2GRAY)
|
||||
|
||||
# Create a 4D blob from cropped image
|
||||
blob = cv.dnn.blobFromImage(cropped, size=(100, 32), mean=127.5, scalefactor=1 / 127.5)
|
||||
recognizer.setInput(blob)
|
||||
|
||||
# Run the recognition model
|
||||
tickmeter.start()
|
||||
result = recognizer.forward()
|
||||
tickmeter.stop()
|
||||
|
||||
# decode the result into text
|
||||
wordRecognized = decodeText(result)
|
||||
cv.putText(frame, wordRecognized, (int(vertices[1][0]), int(vertices[1][1])), cv.FONT_HERSHEY_SIMPLEX,
|
||||
0.5, (255, 0, 0))
|
||||
|
||||
for j in range(4):
|
||||
p1 = (vertices[j][0], vertices[j][1])
|
||||
p2 = (vertices[(j + 1) % 4][0], vertices[(j + 1) % 4][1])
|
||||
cv.line(frame, p1, p2, (0, 255, 0), 1)
|
||||
|
||||
# Put efficiency information
|
||||
label = 'Inference time: %.2f ms' % (tickmeter.getTimeMilli())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
# Display the frame
|
||||
cv.imshow(kWinName, frame)
|
||||
tickmeter.reset()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
333
samples/dnn/tf_text_graph_common.py
Normal file
333
samples/dnn/tf_text_graph_common.py
Normal file
@@ -0,0 +1,333 @@
|
||||
def tokenize(s):
|
||||
tokens = []
|
||||
token = ""
|
||||
isString = False
|
||||
isComment = False
|
||||
for symbol in s:
|
||||
isComment = (isComment and symbol != '\n') or (not isString and symbol == '#')
|
||||
if isComment:
|
||||
continue
|
||||
|
||||
if symbol == ' ' or symbol == '\t' or symbol == '\r' or symbol == '\'' or \
|
||||
symbol == '\n' or symbol == ':' or symbol == '\"' or symbol == ';' or \
|
||||
symbol == ',':
|
||||
|
||||
if (symbol == '\"' or symbol == '\'') and isString:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
else:
|
||||
if isString:
|
||||
token += symbol
|
||||
elif token:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
isString = (symbol == '\"' or symbol == '\'') ^ isString
|
||||
|
||||
elif symbol == '{' or symbol == '}' or symbol == '[' or symbol == ']':
|
||||
if token:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
tokens.append(symbol)
|
||||
else:
|
||||
token += symbol
|
||||
if token:
|
||||
tokens.append(token)
|
||||
return tokens
|
||||
|
||||
|
||||
def parseMessage(tokens, idx):
|
||||
msg = {}
|
||||
assert(tokens[idx] == '{')
|
||||
|
||||
isArray = False
|
||||
while True:
|
||||
if not isArray:
|
||||
idx += 1
|
||||
if idx < len(tokens):
|
||||
fieldName = tokens[idx]
|
||||
else:
|
||||
return None
|
||||
if fieldName == '}':
|
||||
break
|
||||
|
||||
idx += 1
|
||||
fieldValue = tokens[idx]
|
||||
|
||||
if fieldValue == '{':
|
||||
embeddedMsg, idx = parseMessage(tokens, idx)
|
||||
if fieldName in msg:
|
||||
msg[fieldName].append(embeddedMsg)
|
||||
else:
|
||||
msg[fieldName] = [embeddedMsg]
|
||||
elif fieldValue == '[':
|
||||
isArray = True
|
||||
elif fieldValue == ']':
|
||||
isArray = False
|
||||
else:
|
||||
if fieldName in msg:
|
||||
msg[fieldName].append(fieldValue)
|
||||
else:
|
||||
msg[fieldName] = [fieldValue]
|
||||
return msg, idx
|
||||
|
||||
|
||||
def readTextMessage(filePath):
|
||||
if not filePath:
|
||||
return {}
|
||||
with open(filePath, 'rt') as f:
|
||||
content = f.read()
|
||||
|
||||
tokens = tokenize('{' + content + '}')
|
||||
msg = parseMessage(tokens, 0)
|
||||
return msg[0] if msg else {}
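# A tiny illustration (hypothetical input): for a file containing
#   node { name: "input" op: "Placeholder" }
# readTextMessage returns {'node': [{'name': ['input'], 'op': ['Placeholder']}]}.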
|
||||
|
||||
|
||||
def listToTensor(values):
|
||||
if all([isinstance(v, float) for v in values]):
|
||||
dtype = 'DT_FLOAT'
|
||||
field = 'float_val'
|
||||
elif all([isinstance(v, int) for v in values]):
|
||||
dtype = 'DT_INT32'
|
||||
field = 'int_val'
|
||||
else:
|
||||
raise Exception('Wrong values types')
|
||||
|
||||
msg = {
|
||||
'tensor': {
|
||||
'dtype': dtype,
|
||||
'tensor_shape': {
|
||||
'dim': {
|
||||
'size': len(values)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
msg['tensor'][field] = values
|
||||
return msg
|
||||
|
||||
|
||||
def addConstNode(name, values, graph_def):
|
||||
node = NodeDef()
|
||||
node.name = name
|
||||
node.op = 'Const'
|
||||
node.addAttr('value', values)
|
||||
graph_def.node.extend([node])
|
||||
|
||||
|
||||
def addSlice(inp, out, begins, sizes, graph_def):
|
||||
beginsNode = NodeDef()
|
||||
beginsNode.name = out + '/begins'
|
||||
beginsNode.op = 'Const'
|
||||
beginsNode.addAttr('value', begins)
|
||||
graph_def.node.extend([beginsNode])
|
||||
|
||||
sizesNode = NodeDef()
|
||||
sizesNode.name = out + '/sizes'
|
||||
sizesNode.op = 'Const'
|
||||
sizesNode.addAttr('value', sizes)
|
||||
graph_def.node.extend([sizesNode])
|
||||
|
||||
sliced = NodeDef()
|
||||
sliced.name = out
|
||||
sliced.op = 'Slice'
|
||||
sliced.input.append(inp)
|
||||
sliced.input.append(beginsNode.name)
|
||||
sliced.input.append(sizesNode.name)
|
||||
graph_def.node.extend([sliced])
|
||||
|
||||
|
||||
def addReshape(inp, out, shape, graph_def):
|
||||
shapeNode = NodeDef()
|
||||
shapeNode.name = out + '/shape'
|
||||
shapeNode.op = 'Const'
|
||||
shapeNode.addAttr('value', shape)
|
||||
graph_def.node.extend([shapeNode])
|
||||
|
||||
reshape = NodeDef()
|
||||
reshape.name = out
|
||||
reshape.op = 'Reshape'
|
||||
reshape.input.append(inp)
|
||||
reshape.input.append(shapeNode.name)
|
||||
graph_def.node.extend([reshape])
|
||||
|
||||
|
||||
def addSoftMax(inp, out, graph_def):
|
||||
softmax = NodeDef()
|
||||
softmax.name = out
|
||||
softmax.op = 'Softmax'
|
||||
softmax.addAttr('axis', -1)
|
||||
softmax.input.append(inp)
|
||||
graph_def.node.extend([softmax])
|
||||
|
||||
|
||||
def addFlatten(inp, out, graph_def):
|
||||
flatten = NodeDef()
|
||||
flatten.name = out
|
||||
flatten.op = 'Flatten'
|
||||
flatten.input.append(inp)
|
||||
graph_def.node.extend([flatten])
|
||||
|
||||
|
||||
class NodeDef:
|
||||
def __init__(self):
|
||||
self.input = []
|
||||
self.name = ""
|
||||
self.op = ""
|
||||
self.attr = {}
|
||||
|
||||
def addAttr(self, key, value):
|
||||
assert(not key in self.attr)
|
||||
if isinstance(value, bool):
|
||||
self.attr[key] = {'b': value}
|
||||
elif isinstance(value, int):
|
||||
self.attr[key] = {'i': value}
|
||||
elif isinstance(value, float):
|
||||
self.attr[key] = {'f': value}
|
||||
elif isinstance(value, str):
|
||||
self.attr[key] = {'s': value}
|
||||
elif isinstance(value, list):
|
||||
self.attr[key] = listToTensor(value)
|
||||
else:
|
||||
raise Exception('Unknown type of attribute ' + key)
|
||||
|
||||
def Clear(self):
|
||||
self.input = []
|
||||
self.name = ""
|
||||
self.op = ""
|
||||
self.attr = {}
|
||||
|
||||
|
||||
class GraphDef:
|
||||
def __init__(self):
|
||||
self.node = []
|
||||
|
||||
def save(self, filePath):
|
||||
with open(filePath, 'wt') as f:
|
||||
|
||||
def printAttr(d, indent):
|
||||
indent = ' ' * indent
|
||||
for key, value in sorted(d.items(), key=lambda x:x[0].lower()):
|
||||
value = value if isinstance(value, list) else [value]
|
||||
for v in value:
|
||||
if isinstance(v, dict):
|
||||
f.write(indent + key + ' {\n')
|
||||
printAttr(v, len(indent) + 2)
|
||||
f.write(indent + '}\n')
|
||||
else:
|
||||
isString = False
|
||||
if isinstance(v, str) and not v.startswith('DT_'):
|
||||
try:
|
||||
float(v)
|
||||
except:
|
||||
isString = True
|
||||
|
||||
if isinstance(v, bool):
|
||||
printed = 'true' if v else 'false'
|
||||
elif v == 'true' or v == 'false':
|
||||
printed = 'true' if v == 'true' else 'false'
|
||||
elif isString:
|
||||
printed = '\"%s\"' % v
|
||||
else:
|
||||
printed = str(v)
|
||||
f.write(indent + key + ': ' + printed + '\n')
|
||||
|
||||
for node in self.node:
|
||||
f.write('node {\n')
|
||||
f.write(' name: \"%s\"\n' % node.name)
|
||||
f.write(' op: \"%s\"\n' % node.op)
|
||||
for inp in node.input:
|
||||
f.write(' input: \"%s\"\n' % inp)
|
||||
for key, value in sorted(node.attr.items(), key=lambda x:x[0].lower()):
|
||||
f.write(' attr {\n')
|
||||
f.write(' key: \"%s\"\n' % key)
|
||||
f.write(' value {\n')
|
||||
printAttr(value, 6)
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
f.write('}\n')
|
||||
|
||||
|
||||
def parseTextGraph(filePath):
|
||||
msg = readTextMessage(filePath)
|
||||
|
||||
graph = GraphDef()
|
||||
for node in msg['node']:
|
||||
graphNode = NodeDef()
|
||||
graphNode.name = node['name'][0]
|
||||
graphNode.op = node['op'][0]
|
||||
graphNode.input = node['input'] if 'input' in node else []
|
||||
|
||||
if 'attr' in node:
|
||||
for attr in node['attr']:
|
||||
graphNode.attr[attr['key'][0]] = attr['value'][0]
|
||||
|
||||
graph.node.append(graphNode)
|
||||
return graph
|
||||
|
||||
|
||||
# Removes Identity nodes
|
||||
def removeIdentity(graph_def):
|
||||
identities = {}
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Identity' or node.op == 'IdentityN':
|
||||
inp = node.input[0]
|
||||
if inp in identities:
|
||||
identities[node.name] = identities[inp]
|
||||
else:
|
||||
identities[node.name] = inp
|
||||
graph_def.node.remove(node)
|
||||
|
||||
for node in graph_def.node:
|
||||
for i in range(len(node.input)):
|
||||
if node.input[i] in identities:
|
||||
node.input[i] = identities[node.input[i]]
|
||||
|
||||
|
||||
def removeUnusedNodesAndAttrs(to_remove, graph_def):
|
||||
unusedAttrs = ['T', 'Tshape', 'N', 'Tidx', 'Tdim', 'use_cudnn_on_gpu',
|
||||
'Index', 'Tperm', 'is_training', 'Tpaddings']
|
||||
|
||||
removedNodes = []
|
||||
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
op = graph_def.node[i].op
|
||||
name = graph_def.node[i].name
|
||||
|
||||
if to_remove(name, op):
|
||||
if op != 'Const':
|
||||
removedNodes.append(name)
|
||||
|
||||
del graph_def.node[i]
|
||||
else:
|
||||
for attr in unusedAttrs:
|
||||
if attr in graph_def.node[i].attr:
|
||||
del graph_def.node[i].attr[attr]
|
||||
|
||||
# Remove references to removed nodes except Const nodes.
|
||||
for node in graph_def.node:
|
||||
for i in reversed(range(len(node.input))):
|
||||
if node.input[i] in removedNodes:
|
||||
del node.input[i]
|
||||
|
||||
|
||||
def writeTextGraph(modelPath, outputPath, outNodes):
|
||||
try:
|
||||
import cv2 as cv
|
||||
|
||||
cv.dnn.writeTextGraph(modelPath, outputPath)
|
||||
except:
|
||||
import tensorflow as tf
|
||||
from tensorflow.tools.graph_transforms import TransformGraph
|
||||
|
||||
with tf.gfile.FastGFile(modelPath, 'rb') as f:
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
graph_def = TransformGraph(graph_def, ['image_tensor'], outNodes, ['sort_by_execution_order'])
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Const':
|
||||
if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
|
||||
node.attr['value'].tensor.tensor_content = b''
|
||||
|
||||
tf.train.write_graph(graph_def, "", outputPath, as_text=True)
|
||||
236
samples/dnn/tf_text_graph_efficientdet.py
Normal file
236
samples/dnn/tf_text_graph_efficientdet.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# This file is a part of OpenCV project.
|
||||
# It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2020, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
#
|
||||
# Use this script to get the text graph representation (.pbtxt) of EfficientDet
|
||||
# deep learning network trained in https://github.com/google/automl.
|
||||
# Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function.
|
||||
# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
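# A rough usage sketch (file names are illustrative, and the flag names here are assumptions;
# see the argument parser below for the exact options):
#   python tf_text_graph_efficientdet.py --input efficientdet-d0_frozen.pb --output efficientdet-d0.pbtxt
# The resulting .pbtxt can then be loaded together with the frozen graph:
#   net = cv2.dnn.readNetFromTensorflow('efficientdet-d0_frozen.pb', 'efficientdet-d0.pbtxt')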
|
||||
import argparse
|
||||
import re
|
||||
from math import sqrt
|
||||
from tf_text_graph_common import *
|
||||
|
||||
|
||||
class AnchorGenerator:
|
||||
def __init__(self, min_level, aspect_ratios, num_scales, anchor_scale):
|
||||
self.min_level = min_level
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.anchor_scale = anchor_scale
|
||||
self.scales = [2**(float(s) / num_scales) for s in range(num_scales)]
|
||||
|
||||
def get(self, layer_id):
|
||||
widths = []
|
||||
heights = []
|
||||
for s in self.scales:
|
||||
for a in self.aspect_ratios:
|
||||
base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
|
||||
heights.append(base_anchor_size * s * a[1])
|
||||
widths.append(base_anchor_size * s * a[0])
|
||||
return widths, heights
|
||||
|
||||
|
||||
def createGraph(modelPath, outputPath, min_level, aspect_ratios, num_scales,
|
||||
anchor_scale, num_classes, image_width, image_height):
|
||||
print('Min level: %d' % min_level)
|
||||
print('Anchor scale: %f' % anchor_scale)
|
||||
print('Num scales: %d' % num_scales)
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Input image size: %dx%d' % (image_width, image_height))
|
||||
|
||||
# Read the graph.
|
||||
_inpNames = ['image_arrays']
|
||||
outNames = ['detections']
|
||||
|
||||
writeTextGraph(modelPath, outputPath, outNames)
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = []
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Const':
|
||||
continue
|
||||
unconnected.append(node.name)
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
|
||||
nodesToKeep = ['truediv'] # Keep preprocessing nodes
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
scopesToKeep = ('image_arrays', 'efficientnet', 'resample_p6', 'resample_p7',
|
||||
'fpn_cells', 'class_net', 'box_net', 'Reshape', 'concat')
|
||||
|
||||
addConstNode('scale_w', [2.0], graph_def)
|
||||
addConstNode('scale_h', [2.0], graph_def)
|
||||
nodesToKeep += ['scale_w', 'scale_h']
|
||||
|
||||
for node in graph_def.node:
|
||||
if re.match('efficientnet-(.*)/blocks_\d+/se/mul_1', node.name):
|
||||
node.input[0], node.input[1] = node.input[1], node.input[0]
|
||||
|
||||
if re.match('fpn_cells/cell_\d+/fnode\d+/resample(.*)/nearest_upsampling/Reshape_1$', node.name):
|
||||
node.op = 'ResizeNearestNeighbor'
|
||||
node.input[1] = 'scale_w'
|
||||
node.input.append('scale_h')
|
||||
|
||||
for inpNode in graph_def.node:
|
||||
if inpNode.name == node.name[:node.name.rfind('_')]:
|
||||
node.input[0] = inpNode.input[0]
|
||||
|
||||
if re.match('box_net/box-predict(_\d)*/separable_conv2d$', node.name):
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
# Replace RealDiv to Mul with inversed scale for compatibility
|
||||
if node.op == 'RealDiv':
|
||||
for inpNode in graph_def.node:
|
||||
if inpNode.name != node.input[1] or not 'value' in inpNode.attr:
|
||||
continue
|
||||
|
||||
tensor = inpNode.attr['value']['tensor'][0]
|
||||
if not 'float_val' in tensor:
|
||||
continue
|
||||
scale = float(inpNode.attr['value']['tensor'][0]['float_val'][0])
|
||||
|
||||
addConstNode(inpNode.name + '/inv', [1.0 / scale], graph_def)
|
||||
nodesToKeep.append(inpNode.name + '/inv')
|
||||
node.input[1] = inpNode.name + '/inv'
|
||||
node.op = 'Mul'
|
||||
break
|
||||
|
||||
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or not name.startswith(scopesToKeep)
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
# Attach unconnected preprocessing
|
||||
assert(graph_def.node[1].name == 'truediv' and graph_def.node[1].op == 'RealDiv')
|
||||
graph_def.node[1].input.insert(0, 'image_arrays')
|
||||
graph_def.node[2].input.insert(0, 'truediv')
|
||||
|
||||
priors_generator = AnchorGenerator(min_level, aspect_ratios, num_scales, anchor_scale)
|
||||
priorBoxes = []
|
||||
for i in range(5):
|
||||
inpName = ''
|
||||
for node in graph_def.node:
|
||||
if node.name == 'Reshape_%d' % (i * 2 + 1):
|
||||
inpName = node.input[0]
|
||||
break
|
||||
|
||||
priorBox = NodeDef()
|
||||
priorBox.name = 'PriorBox_%d' % i
|
||||
priorBox.op = 'PriorBox'
|
||||
priorBox.input.append(inpName)
|
||||
priorBox.input.append(graph_def.node[0].name) # input image placeholder ('image_arrays')
|
||||
|
||||
priorBox.addAttr('flip', False)
|
||||
priorBox.addAttr('clip', False)
|
||||
|
||||
widths, heights = priors_generator.get(i)
|
||||
|
||||
priorBox.addAttr('width', widths)
|
||||
priorBox.addAttr('height', heights)
|
||||
priorBox.addAttr('variance', [1.0, 1.0, 1.0, 1.0])
|
||||
|
||||
graph_def.node.extend([priorBox])
|
||||
priorBoxes.append(priorBox.name)
|
||||
|
||||
addConstNode('concat/axis_flatten', [-1], graph_def)
|
||||
|
||||
def addConcatNode(name, inputs, axisNodeName):
|
||||
concat = NodeDef()
|
||||
concat.name = name
|
||||
concat.op = 'ConcatV2'
|
||||
for inp in inputs:
|
||||
concat.input.append(inp)
|
||||
concat.input.append(axisNodeName)
|
||||
graph_def.node.extend([concat])
|
||||
|
||||
addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')
|
||||
|
||||
sigmoid = NodeDef()
|
||||
sigmoid.name = 'concat/sigmoid'
|
||||
sigmoid.op = 'Sigmoid'
|
||||
sigmoid.input.append('concat')
|
||||
graph_def.node.extend([sigmoid])
|
||||
|
||||
addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def)
|
||||
addFlatten('concat_1', 'concat_1/Flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('concat_1/Flatten')
|
||||
detectionOut.input.append(sigmoid.name + '/Flatten')
|
||||
detectionOut.input.append('PriorBox/concat')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('confidence_threshold', 0.2)
|
||||
detectionOut.addAttr('top_k', 100)
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
graph_def.node[0].attr['shape'] = {
|
||||
'shape': {
|
||||
'dim': [
|
||||
{'size': -1},
|
||||
{'size': image_height},
|
||||
{'size': image_width},
|
||||
{'size': 3}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
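# Iteratively drop dangling nodes: anything that does not (transitively) feed 'detection_out'
# is removed until only the detection graph remains.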
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'EfficientDet model. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--min_level', default=3, type=int, help='Parameter from training config')
|
||||
parser.add_argument('--num_scales', default=3, type=int, help='Parameter from training config')
|
||||
parser.add_argument('--anchor_scale', default=4.0, type=float, help='Parameter from training config')
|
||||
parser.add_argument('--aspect_ratios', default=[1.0, 1.0, 1.4, 0.7, 0.7, 1.4],
|
||||
nargs='+', type=float, help='Parameter from training config')
|
||||
parser.add_argument('--num_classes', default=90, type=int, help='Number of classes to detect')
|
||||
parser.add_argument('--width', default=512, type=int, help='Network input width')
|
||||
parser.add_argument('--height', default=512, type=int, help='Network input height')
|
||||
args = parser.parse_args()
|
||||
|
||||
ar = args.aspect_ratios
|
||||
assert(len(ar) % 2 == 0)
|
||||
ar = list(zip(ar[::2], ar[1::2]))
|
||||
|
||||
createGraph(args.input, args.output, args.min_level, ar, args.num_scales,
|
||||
args.anchor_scale, args.num_classes, args.width, args.height)
|
||||
299
samples/dnn/tf_text_graph_faster_rcnn.py
Normal file
299
samples/dnn/tf_text_graph_faster_rcnn.py
Normal file
@@ -0,0 +1,299 @@
|
||||
import argparse
|
||||
import numpy as np
|
||||
from tf_text_graph_common import *
|
||||
|
||||
|
||||
def createFasterRCNNGraph(modelPath, configPath, outputPath):
|
||||
scopesToKeep = ('FirstStageFeatureExtractor', 'Conv',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor',
|
||||
'FirstStageBoxPredictor/ClassPredictor',
|
||||
'CropAndResize',
|
||||
'MaxPool2D',
|
||||
'SecondStageFeatureExtractor',
|
||||
'SecondStageBoxPredictor',
|
||||
'Preprocessor/sub',
|
||||
'Preprocessor/mul',
|
||||
'image_tensor')
|
||||
|
||||
scopesToIgnore = ('FirstStageFeatureExtractor/Assert',
|
||||
'FirstStageFeatureExtractor/Shape',
|
||||
'FirstStageFeatureExtractor/strided_slice',
|
||||
'FirstStageFeatureExtractor/GreaterEqual',
|
||||
'FirstStageFeatureExtractor/LogicalAnd')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(configPath)
|
||||
config = config['model'][0]['faster_rcnn'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
grid_anchor_generator = config['first_stage_anchor_generator'][0]['grid_anchor_generator'][0]
|
||||
scales = [float(s) for s in grid_anchor_generator['scales']]
|
||||
aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
|
||||
width_stride = float(grid_anchor_generator['width_stride'][0])
|
||||
height_stride = float(grid_anchor_generator['height_stride'][0])
|
||||
|
||||
feature_extractor = config['feature_extractor'][0]
|
||||
if 'type' in feature_extractor and feature_extractor['type'][0] == 'faster_rcnn_nas':
|
||||
features_stride = 16.0
|
||||
else:
|
||||
features_stride = float(feature_extractor['first_stage_features_stride'][0])
|
||||
|
||||
first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
|
||||
first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Scales: %s' % str(scales))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Width stride: %f' % width_stride)
|
||||
print('Height stride: %f' % height_stride)
|
||||
print('Features stride: %f' % features_stride)
|
||||
|
||||
# Read the graph.
|
||||
writeTextGraph(modelPath, outputPath, ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes'])
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
nodesToKeep = []
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
|
||||
(name.startswith('CropAndResize') and op != 'CropAndResize')
|
||||
|
||||
# Fuse atrous convolutions (with dilations).
|
||||
nodesMap = {node.name: node for node in graph_def.node}
|
||||
for node in reversed(graph_def.node):
|
||||
if node.op == 'BatchToSpaceND':
|
||||
del node.input[2]
|
||||
conv = nodesMap[node.input[0]]
|
||||
spaceToBatchND = nodesMap[conv.input[0]]
|
||||
|
||||
# Extract paddings
|
||||
stridedSlice = nodesMap[spaceToBatchND.input[2]]
|
||||
assert(stridedSlice.op == 'StridedSlice')
|
||||
pack = nodesMap[stridedSlice.input[0]]
|
||||
assert(pack.op == 'Pack')
|
||||
|
||||
padNodeH = nodesMap[nodesMap[pack.input[0]].input[0]]
|
||||
padNodeW = nodesMap[nodesMap[pack.input[1]].input[0]]
|
||||
padH = int(padNodeH.attr['value']['tensor'][0]['int_val'][0])
|
||||
padW = int(padNodeW.attr['value']['tensor'][0]['int_val'][0])
|
||||
|
||||
paddingsNode = NodeDef()
|
||||
paddingsNode.name = conv.name + '/paddings'
|
||||
paddingsNode.op = 'Const'
|
||||
paddingsNode.addAttr('value', [padH, padH, padW, padW])
|
||||
graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
|
||||
nodesToKeep.append(paddingsNode.name)
|
||||
|
||||
spaceToBatchND.input[2] = paddingsNode.name
|
||||
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
graph_def.node[1].input.insert(0, graph_def.node[0].name)
|
||||
|
||||
# Temporarily remove top nodes.
|
||||
topNodes = []
|
||||
while True:
|
||||
node = graph_def.node.pop()
|
||||
topNodes.append(node)
|
||||
if node.op == 'CropAndResize':
|
||||
break
|
||||
|
||||
addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2], graph_def)
|
||||
|
||||
addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax', graph_def) # Compare with Reshape_4
|
||||
|
||||
addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax/flatten', graph_def)
|
||||
|
||||
# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
|
||||
addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor/flatten', graph_def)
|
||||
|
||||
proposals = NodeDef()
|
||||
proposals.name = 'proposals' # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
|
||||
proposals.op = 'PriorBox'
|
||||
proposals.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
|
||||
proposals.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
proposals.addAttr('flip', False)
|
||||
proposals.addAttr('clip', True)
|
||||
proposals.addAttr('step', features_stride)
|
||||
proposals.addAttr('offset', 0.0)
|
||||
proposals.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
|
||||
|
||||
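# Anchor sizes follow the grid_anchor_generator convention. Example with typical config values
# (an assumption, not read from this file): height_stride = width_stride = 16,
# scales = [0.25, 0.5, 1.0, 2.0], aspect_ratios = [0.5, 1.0, 2.0] gives a 256x256 px anchor
# for a = 1.0, s = 1.0 and roughly 362x181 px (width x height) for a = 2.0, s = 1.0.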
widths = []
|
||||
heights = []
|
||||
for a in aspect_ratios:
|
||||
for s in scales:
|
||||
ar = np.sqrt(a)
|
||||
heights.append((height_stride**2) * s / ar)
|
||||
widths.append((width_stride**2) * s * ar)
|
||||
|
||||
proposals.addAttr('width', widths)
|
||||
proposals.addAttr('height', heights)
|
||||
|
||||
graph_def.node.extend([proposals])
|
||||
|
||||
# Compare with Reshape_5
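# This DetectionOutput plays the role of the RPN: it decodes the first-stage box deltas against
# the PriorBox anchors, runs NMS with the configured IoU threshold and keeps at most
# first_stage_max_proposals proposals (two classes: background vs. object).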
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
|
||||
detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
|
||||
detectionOut.input.append('proposals')
|
||||
|
||||
detectionOut.addAttr('num_classes', 2)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
|
||||
detectionOut.addAttr('top_k', 6000)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
|
||||
detectionOut.addAttr('clip', False)
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
addConstNode('clip_by_value/lower', [0.0], graph_def)
|
||||
addConstNode('clip_by_value/upper', [1.0], graph_def)
|
||||
|
||||
clipByValueNode = NodeDef()
|
||||
clipByValueNode.name = 'detection_out/clip_by_value'
|
||||
clipByValueNode.op = 'ClipByValue'
|
||||
clipByValueNode.input.append('detection_out')
|
||||
clipByValueNode.input.append('clip_by_value/lower')
|
||||
clipByValueNode.input.append('clip_by_value/upper')
|
||||
graph_def.node.extend([clipByValueNode])
|
||||
|
||||
# Save as text.
|
||||
for node in reversed(topNodes):
|
||||
graph_def.node.extend([node])
|
||||
|
||||
addSoftMax('SecondStageBoxPredictor/Reshape_1', 'SecondStageBoxPredictor/Reshape_1/softmax', graph_def)
|
||||
|
||||
addSlice('SecondStageBoxPredictor/Reshape_1/softmax',
|
||||
'SecondStageBoxPredictor/Reshape_1/slice',
|
||||
[0, 0, 1], [-1, -1, -1], graph_def)
|
||||
|
||||
addReshape('SecondStageBoxPredictor/Reshape_1/slice',
|
||||
'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)
|
||||
|
||||
# Replace the Flatten subgraph with a single node.
|
||||
cropAndResizeNodeName = ''
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out/clip_by_value')
|
||||
cropAndResizeNodeName = graph_def.node[i].name
|
||||
|
||||
if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
|
||||
addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
|
||||
|
||||
graph_def.node[i].input.pop()
|
||||
graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')
|
||||
|
||||
if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
|
||||
del graph_def.node[i]
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
|
||||
node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
|
||||
node.op = 'Flatten'
|
||||
node.input.pop()
|
||||
|
||||
if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
|
||||
'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(cropAndResizeNodeName)
|
||||
node.input = [cropAndResizeNodeName]
|
||||
|
||||
################################################################################
|
||||
### Postprocessing
|
||||
################################################################################
|
||||
addSlice('detection_out/clip_by_value', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4], graph_def)
|
||||
|
||||
variance = NodeDef()
|
||||
variance.name = 'proposals/variance'
|
||||
variance.op = 'Const'
|
||||
variance.addAttr('value', [0.1, 0.1, 0.2, 0.2])
|
||||
graph_def.node.extend([variance])
|
||||
|
||||
varianceEncoder = NodeDef()
|
||||
varianceEncoder.name = 'variance_encoded'
|
||||
varianceEncoder.op = 'Mul'
|
||||
varianceEncoder.input.append('SecondStageBoxPredictor/Reshape')
|
||||
varianceEncoder.input.append(variance.name)
|
||||
varianceEncoder.addAttr('axis', 2)
|
||||
graph_def.node.extend([varianceEncoder])
|
||||
|
||||
addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1], graph_def)
|
||||
addFlatten('variance_encoded', 'variance_encoded/flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out_final'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('variance_encoded/flatten')
|
||||
detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
|
||||
detectionOut.input.append('detection_out/slice/reshape')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', False)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
detectionOut.addAttr('clip', True)
|
||||
detectionOut.addAttr('variance_encoded_in_target', True)
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = [node.name for node in graph_def.node]
|
||||
for node in graph_def.node:
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'Faster-RCNN model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to a *.config file used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
createFasterRCNNGraph(args.input, args.config, args.output)
|
||||
297
samples/dnn/tf_text_graph_mask_rcnn.py
Normal file
297
samples/dnn/tf_text_graph_mask_rcnn.py
Normal file
@@ -0,0 +1,297 @@
|
||||
import argparse
|
||||
import numpy as np
|
||||
from tf_text_graph_common import *
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'Mask-RCNN model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to a *.config file used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
scopesToKeep = ('FirstStageFeatureExtractor', 'Conv',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor',
|
||||
'FirstStageBoxPredictor/ClassPredictor',
|
||||
'CropAndResize',
|
||||
'MaxPool2D',
|
||||
'SecondStageFeatureExtractor',
|
||||
'SecondStageBoxPredictor',
|
||||
'Preprocessor/sub',
|
||||
'Preprocessor/mul',
|
||||
'image_tensor')
|
||||
|
||||
scopesToIgnore = ('FirstStageFeatureExtractor/Assert',
|
||||
'FirstStageFeatureExtractor/Shape',
|
||||
'FirstStageFeatureExtractor/strided_slice',
|
||||
'FirstStageFeatureExtractor/GreaterEqual',
|
||||
'FirstStageFeatureExtractor/LogicalAnd',
|
||||
'Conv/required_space_to_batch_paddings')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(args.config)
|
||||
config = config['model'][0]['faster_rcnn'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
grid_anchor_generator = config['first_stage_anchor_generator'][0]['grid_anchor_generator'][0]
|
||||
scales = [float(s) for s in grid_anchor_generator['scales']]
|
||||
aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
|
||||
width_stride = float(grid_anchor_generator['width_stride'][0])
|
||||
height_stride = float(grid_anchor_generator['height_stride'][0])
|
||||
features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
|
||||
first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
|
||||
first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Scales: %s' % str(scales))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Width stride: %f' % width_stride)
|
||||
print('Height stride: %f' % height_stride)
|
||||
print('Features stride: %f' % features_stride)
|
||||
|
||||
# Read the graph.
|
||||
writeTextGraph(args.input, args.output, ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes', 'detection_masks'])
|
||||
graph_def = parseTextGraph(args.output)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
nodesToKeep = []
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
|
||||
(name.startswith('CropAndResize') and op != 'CropAndResize')
|
||||
|
||||
# Fuse atrous convolutions (with dilations).
|
||||
nodesMap = {node.name: node for node in graph_def.node}
|
||||
for node in reversed(graph_def.node):
|
||||
if node.op == 'BatchToSpaceND':
|
||||
del node.input[2]
|
||||
conv = nodesMap[node.input[0]]
|
||||
spaceToBatchND = nodesMap[conv.input[0]]
|
||||
|
||||
paddingsNode = NodeDef()
|
||||
paddingsNode.name = conv.name + '/paddings'
|
||||
paddingsNode.op = 'Const'
|
||||
paddingsNode.addAttr('value', [2, 2, 2, 2])
|
||||
graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
|
||||
nodesToKeep.append(paddingsNode.name)
|
||||
|
||||
spaceToBatchND.input[2] = paddingsNode.name
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
graph_def.node[1].input.insert(0, graph_def.node[0].name)
|
||||
|
||||
# Temporarily remove top nodes.
|
||||
topNodes = []
|
||||
numCropAndResize = 0
|
||||
while True:
|
||||
node = graph_def.node.pop()
|
||||
topNodes.append(node)
|
||||
if node.op == 'CropAndResize':
|
||||
numCropAndResize += 1
|
||||
if numCropAndResize == 2:
|
||||
break
|
||||
|
||||
addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2], graph_def)
|
||||
|
||||
addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax', graph_def) # Compare with Reshape_4
|
||||
|
||||
addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax/flatten', graph_def)
|
||||
|
||||
# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
|
||||
addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor/flatten', graph_def)
|
||||
|
||||
proposals = NodeDef()
|
||||
proposals.name = 'proposals' # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
|
||||
proposals.op = 'PriorBox'
|
||||
proposals.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
|
||||
proposals.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
proposals.addAttr('flip', False)
|
||||
proposals.addAttr('clip', True)
|
||||
proposals.addAttr('step', features_stride)
|
||||
proposals.addAttr('offset', 0.0)
|
||||
proposals.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
|
||||
|
||||
widths = []
|
||||
heights = []
|
||||
for a in aspect_ratios:
|
||||
for s in scales:
|
||||
ar = np.sqrt(a)
|
||||
heights.append((height_stride**2) * s / ar)
|
||||
widths.append((width_stride**2) * s * ar)
|
||||
|
||||
proposals.addAttr('width', widths)
|
||||
proposals.addAttr('height', heights)
|
||||
|
||||
graph_def.node.extend([proposals])
|
||||
|
||||
# Compare with Reshape_5
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
|
||||
detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
|
||||
detectionOut.input.append('proposals')
|
||||
|
||||
detectionOut.addAttr('num_classes', 2)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
|
||||
detectionOut.addAttr('top_k', 6000)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
|
||||
detectionOut.addAttr('clip', True)
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
# Save as text.
|
||||
cropAndResizeNodesNames = []
|
||||
for node in reversed(topNodes):
|
||||
if node.op != 'CropAndResize':
|
||||
graph_def.node.extend([node])
|
||||
topNodes.pop()
|
||||
else:
|
||||
cropAndResizeNodesNames.append(node.name)
|
||||
if numCropAndResize == 1:
|
||||
break
|
||||
else:
|
||||
graph_def.node.extend([node])
|
||||
topNodes.pop()
|
||||
numCropAndResize -= 1
|
||||
|
||||
addSoftMax('SecondStageBoxPredictor/Reshape_1', 'SecondStageBoxPredictor/Reshape_1/softmax', graph_def)
|
||||
|
||||
addSlice('SecondStageBoxPredictor/Reshape_1/softmax',
|
||||
'SecondStageBoxPredictor/Reshape_1/slice',
|
||||
[0, 0, 1], [-1, -1, -1], graph_def)
|
||||
|
||||
addReshape('SecondStageBoxPredictor/Reshape_1/slice',
|
||||
'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)
|
||||
|
||||
# Replace the Flatten subgraph with a single node.
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out')
|
||||
|
||||
if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
|
||||
addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
|
||||
|
||||
graph_def.node[i].input.pop()
|
||||
graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')
|
||||
|
||||
if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
|
||||
del graph_def.node[i]
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
|
||||
node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
|
||||
node.op = 'Flatten'
|
||||
node.input.pop()
|
||||
|
||||
if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
|
||||
'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(len(cropAndResizeNodesNames) == 2)
|
||||
node.input = [cropAndResizeNodesNames[0]]
|
||||
del cropAndResizeNodesNames[0]
|
||||
|
||||
################################################################################
|
||||
### Postprocessing
|
||||
################################################################################
|
||||
addSlice('detection_out', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4], graph_def)
|
||||
|
||||
variance = NodeDef()
|
||||
variance.name = 'proposals/variance'
|
||||
variance.op = 'Const'
|
||||
variance.addAttr('value', [0.1, 0.1, 0.2, 0.2])
|
||||
graph_def.node.extend([variance])
|
||||
|
||||
varianceEncoder = NodeDef()
|
||||
varianceEncoder.name = 'variance_encoded'
|
||||
varianceEncoder.op = 'Mul'
|
||||
varianceEncoder.input.append('SecondStageBoxPredictor/Reshape')
|
||||
varianceEncoder.input.append(variance.name)
|
||||
varianceEncoder.addAttr('axis', 2)
|
||||
graph_def.node.extend([varianceEncoder])
|
||||
|
||||
addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1], graph_def)
|
||||
addFlatten('variance_encoded', 'variance_encoded/flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out_final'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('variance_encoded/flatten')
|
||||
detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
|
||||
detectionOut.input.append('detection_out/slice/reshape')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', False)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
detectionOut.addAttr('clip', True)
|
||||
detectionOut.addAttr('variance_encoded_in_target', True)
|
||||
detectionOut.addAttr('confidence_threshold', 0.3)
|
||||
detectionOut.addAttr('group_by_classes', False)
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
for node in reversed(topNodes):
|
||||
graph_def.node.extend([node])
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(len(cropAndResizeNodesNames) == 1)
|
||||
node.input = [cropAndResizeNodesNames[0]]
|
||||
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out_final')
|
||||
break
|
||||
|
||||
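# Repurpose the last remaining top node as the mask output: it becomes a Sigmoid over the mask
# logits and is renamed 'detection_masks' so the output can be fetched by name.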
graph_def.node[-1].name = 'detection_masks'
|
||||
graph_def.node[-1].op = 'Sigmoid'
|
||||
graph_def.node[-1].input.pop()
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = [node.name for node in graph_def.node]
|
||||
for node in graph_def.node:
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(graph_def.node[-1].name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(args.output)
|
||||
413
samples/dnn/tf_text_graph_ssd.py
Normal file
413
samples/dnn/tf_text_graph_ssd.py
Normal file
@@ -0,0 +1,413 @@
|
||||
# This file is a part of OpenCV project.
|
||||
# It is a subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2018, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
#
|
||||
# Use this script to get the text graph representation (.pbtxt) of SSD-based
|
||||
# deep learning network trained in TensorFlow Object Detection API.
|
||||
# Then you can import it with a binary frozen graph (.pb) using the readNetFromTensorflow() function.
|
||||
# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
|
||||
import argparse
|
||||
import re
|
||||
from math import sqrt
|
||||
from tf_text_graph_common import *
|
||||
|
||||
class SSDAnchorGenerator:
|
||||
def __init__(self, min_scale, max_scale, num_layers, aspect_ratios,
|
||||
reduce_boxes_in_lowest_layer, image_width, image_height):
|
||||
self.min_scale = min_scale
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer
|
||||
self.image_width = image_width
|
||||
self.image_height = image_height
|
||||
self.scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
|
||||
for i in range(num_layers)] + [1.0]
|
||||
|
||||
def get(self, layer_id):
|
||||
if layer_id == 0 and self.reduce_boxes_in_lowest_layer:
|
||||
widths = [0.1, self.min_scale * sqrt(2.0), self.min_scale * sqrt(0.5)]
|
||||
heights = [0.1, self.min_scale / sqrt(2.0), self.min_scale / sqrt(0.5)]
|
||||
else:
|
||||
widths = [self.scales[layer_id] * sqrt(ar) for ar in self.aspect_ratios]
|
||||
heights = [self.scales[layer_id] / sqrt(ar) for ar in self.aspect_ratios]
|
||||
|
||||
widths += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
|
||||
heights += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
|
||||
min_size = min(self.image_width, self.image_height)
|
||||
widths = [w * min_size for w in widths]
|
||||
heights = [h * min_size for h in heights]
|
||||
return widths, heights
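# Worked example (illustrative values, not taken from this file): with min_scale=0.2,
# max_scale=0.95 and num_layers=6, self.scales == [0.2, 0.35, 0.5, 0.65, 0.8, 0.95, 1.0].
# For layer_id=1 and aspect_ratios=[1.0, 2.0, 0.5], get(1) yields three boxes of relative
# size 0.35 (stretched by sqrt(ar)) plus one extra box of size sqrt(0.35 * 0.5) ~= 0.42,
# all multiplied by min(image_width, image_height) to obtain pixel units.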
|
||||
|
||||
|
||||
class MultiscaleAnchorGenerator:
|
||||
def __init__(self, min_level, aspect_ratios, scales_per_octave, anchor_scale):
|
||||
self.min_level = min_level
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.anchor_scale = anchor_scale
|
||||
self.scales = [2**(float(s) / scales_per_octave) for s in range(scales_per_octave)]
|
||||
|
||||
def get(self, layer_id):
|
||||
widths = []
|
||||
heights = []
|
||||
for a in self.aspect_ratios:
|
||||
for s in self.scales:
|
||||
base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
|
||||
ar = sqrt(a)
|
||||
heights.append(base_anchor_size * s / ar)
|
||||
widths.append(base_anchor_size * s * ar)
|
||||
return widths, heights
|
||||
|
||||
|
||||
def createSSDGraph(modelPath, configPath, outputPath):
|
||||
# Nodes that should be kept.
|
||||
keepOps = ['Conv2D', 'BiasAdd', 'Add', 'AddV2', 'Relu', 'Relu6', 'Placeholder', 'FusedBatchNorm',
|
||||
'DepthwiseConv2dNative', 'ConcatV2', 'Mul', 'MaxPool', 'AvgPool', 'Identity',
|
||||
'Sub', 'ResizeNearestNeighbor', 'Pad', 'FusedBatchNormV3', 'Mean']
|
||||
|
||||
# Nodes whose name prefixes should be removed
|
||||
prefixesToRemove = ('MultipleGridAnchorGenerator/', 'Concatenate/', 'Postprocessor/', 'Preprocessor/map')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(configPath)
|
||||
config = config['model'][0]['ssd'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0]
|
||||
image_width = int(fixed_shape_resizer['width'][0])
|
||||
image_height = int(fixed_shape_resizer['height'][0])
|
||||
|
||||
box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional'
|
||||
|
||||
anchor_generator = config['anchor_generator'][0]
|
||||
if 'ssd_anchor_generator' in anchor_generator:
|
||||
ssd_anchor_generator = anchor_generator['ssd_anchor_generator'][0]
|
||||
min_scale = float(ssd_anchor_generator['min_scale'][0])
|
||||
max_scale = float(ssd_anchor_generator['max_scale'][0])
|
||||
num_layers = int(ssd_anchor_generator['num_layers'][0])
|
||||
aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']]
|
||||
reduce_boxes_in_lowest_layer = True
|
||||
if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator:
|
||||
reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true'
|
||||
priors_generator = SSDAnchorGenerator(min_scale, max_scale, num_layers,
|
||||
aspect_ratios, reduce_boxes_in_lowest_layer,
|
||||
image_width, image_height)
|
||||
|
||||
|
||||
print('Scale: [%f-%f]' % (min_scale, max_scale))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Reduce boxes in the lowest layer: %s' % str(reduce_boxes_in_lowest_layer))
|
||||
elif 'multiscale_anchor_generator' in anchor_generator:
|
||||
multiscale_anchor_generator = anchor_generator['multiscale_anchor_generator'][0]
|
||||
min_level = int(multiscale_anchor_generator['min_level'][0])
|
||||
max_level = int(multiscale_anchor_generator['max_level'][0])
|
||||
anchor_scale = float(multiscale_anchor_generator['anchor_scale'][0])
|
||||
aspect_ratios = [float(ar) for ar in multiscale_anchor_generator['aspect_ratios']]
|
||||
scales_per_octave = int(multiscale_anchor_generator['scales_per_octave'][0])
|
||||
num_layers = max_level - min_level + 1
|
||||
priors_generator = MultiscaleAnchorGenerator(min_level, aspect_ratios,
|
||||
scales_per_octave, anchor_scale)
|
||||
print('Levels: [%d-%d]' % (min_level, max_level))
|
||||
print('Anchor scale: %f' % anchor_scale)
|
||||
print('Scales per octave: %d' % scales_per_octave)
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
else:
|
||||
print('Unknown anchor_generator')
|
||||
exit(0)
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Number of layers: %d' % num_layers)
|
||||
print('box predictor: %s' % box_predictor)
|
||||
print('Input image size: %dx%d' % (image_width, image_height))
|
||||
|
||||
# Read the graph.
|
||||
outNames = ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes']
|
||||
|
||||
writeTextGraph(modelPath, outputPath, outNames)
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = []
|
||||
for node in graph_def.node:
|
||||
unconnected.append(node.name)
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
|
||||
def fuse_nodes(nodesToKeep):
|
||||
# Detect unfused batch normalization nodes and fuse them.
|
||||
# Add_0 <-- moving_variance, add_y
|
||||
# Rsqrt <-- Add_0
|
||||
# Mul_0 <-- Rsqrt, gamma
|
||||
# Mul_1 <-- input, Mul_0
|
||||
# Mul_2 <-- moving_mean, Mul_0
|
||||
# Sub_0 <-- beta, Mul_2
|
||||
# Add_1 <-- Mul_1, Sub_0
|
||||
nodesMap = {node.name: node for node in graph_def.node}
|
||||
subgraphBatchNorm = ['Add',
|
||||
['Mul', 'input', ['Mul', ['Rsqrt', ['Add', 'moving_variance', 'add_y']], 'gamma']],
|
||||
['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]]
|
||||
subgraphBatchNormV2 = ['AddV2',
|
||||
['Mul', 'input', ['Mul', ['Rsqrt', ['AddV2', 'moving_variance', 'add_y']], 'gamma']],
|
||||
['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]]
|
||||
# Detect unfused nearest neighbor resize.
|
||||
subgraphResizeNN = ['Reshape',
|
||||
['Mul', ['Reshape', 'input', ['Pack', 'shape_1', 'shape_2', 'shape_3', 'shape_4', 'shape_5']],
|
||||
'ones'],
|
||||
['Pack', ['StridedSlice', ['Shape', 'input'], 'stack', 'stack_1', 'stack_2'],
|
||||
'out_height', 'out_width', 'out_channels']]
|
||||
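# checkSubgraph matches these templates top-down: list entries describe an op and its input
# patterns, plain strings capture the corresponding input names into 'inputs', and every
# matched node is collected in 'fusedNodes' so that all but the root can be removed once the
# subgraph is fused into a single layer.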
def checkSubgraph(node, targetNode, inputs, fusedNodes):
|
||||
op = targetNode[0]
|
||||
if node.op == op and (len(node.input) >= len(targetNode) - 1):
|
||||
fusedNodes.append(node)
|
||||
for i, inpOp in enumerate(targetNode[1:]):
|
||||
if isinstance(inpOp, list):
|
||||
if not node.input[i] in nodesMap or \
|
||||
not checkSubgraph(nodesMap[node.input[i]], inpOp, inputs, fusedNodes):
|
||||
return False
|
||||
else:
|
||||
inputs[inpOp] = node.input[i]
|
||||
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
nodesToRemove = []
|
||||
for node in graph_def.node:
|
||||
inputs = {}
|
||||
fusedNodes = []
|
||||
if checkSubgraph(node, subgraphBatchNorm, inputs, fusedNodes) or \
|
||||
checkSubgraph(node, subgraphBatchNormV2, inputs, fusedNodes):
|
||||
name = node.name
|
||||
node.Clear()
|
||||
node.name = name
|
||||
node.op = 'FusedBatchNorm'
|
||||
node.input.append(inputs['input'])
|
||||
node.input.append(inputs['gamma'])
|
||||
node.input.append(inputs['beta'])
|
||||
node.input.append(inputs['moving_mean'])
|
||||
node.input.append(inputs['moving_variance'])
|
||||
node.addAttr('epsilon', 0.001)
|
||||
nodesToRemove += fusedNodes[1:]
|
||||
|
||||
inputs = {}
|
||||
fusedNodes = []
|
||||
if checkSubgraph(node, subgraphResizeNN, inputs, fusedNodes):
|
||||
name = node.name
|
||||
node.Clear()
|
||||
node.name = name
|
||||
node.op = 'ResizeNearestNeighbor'
|
||||
node.input.append(inputs['input'])
|
||||
node.input.append(name + '/output_shape')
|
||||
|
||||
out_height_node = nodesMap[inputs['out_height']]
|
||||
out_width_node = nodesMap[inputs['out_width']]
|
||||
out_height = int(out_height_node.attr['value']['tensor'][0]['int_val'][0])
|
||||
out_width = int(out_width_node.attr['value']['tensor'][0]['int_val'][0])
|
||||
|
||||
shapeNode = NodeDef()
|
||||
shapeNode.name = name + '/output_shape'
|
||||
shapeNode.op = 'Const'
|
||||
shapeNode.addAttr('value', [out_height, out_width])
|
||||
graph_def.node.insert(graph_def.node.index(node), shapeNode)
|
||||
nodesToKeep.append(shapeNode.name)
|
||||
|
||||
nodesToRemove += fusedNodes[1:]
|
||||
for node in nodesToRemove:
|
||||
graph_def.node.remove(node)
|
||||
|
||||
nodesToKeep = []
|
||||
fuse_nodes(nodesToKeep)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
def to_remove(name, op):
|
||||
return (not name in nodesToKeep) and \
|
||||
(op == 'Const' or (not op in keepOps) or name.startswith(prefixesToRemove))
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
try:
|
||||
input_shape = graph_def.node[0].attr['shape']['shape'][0]['dim']
|
||||
input_shape[1]['size'] = image_height
|
||||
input_shape[2]['size'] = image_width
|
||||
except:
|
||||
print("Input shapes are undefined")
|
||||
# assert(graph_def.node[1].op == 'Conv2D')
|
||||
weights = graph_def.node[1].input[-1]
|
||||
for i in range(len(graph_def.node[1].input)):
|
||||
graph_def.node[1].input.pop()
|
||||
graph_def.node[1].input.append(graph_def.node[0].name)
|
||||
graph_def.node[1].input.append(weights)
|
||||
|
||||
# Check and correct the case when the preprocessing block follows the input node
|
||||
preproc_id = "Preprocessor/"
|
||||
if graph_def.node[2].name.startswith(preproc_id) and \
|
||||
graph_def.node[2].input[0].startswith(preproc_id):
|
||||
|
||||
if not any(preproc_id in inp for inp in graph_def.node[3].input):
|
||||
graph_def.node[3].input.insert(0, graph_def.node[2].name)
|
||||
|
||||
|
||||
# Create SSD postprocessing head ###############################################
|
||||
|
||||
# Concatenate predictions of classes, predictions of bounding boxes and proposals.
|
||||
def addConcatNode(name, inputs, axisNodeName):
|
||||
concat = NodeDef()
|
||||
concat.name = name
|
||||
concat.op = 'ConcatV2'
|
||||
for inp in inputs:
|
||||
concat.input.append(inp)
|
||||
concat.input.append(axisNodeName)
|
||||
graph_def.node.extend([concat])
|
||||
|
||||
addConstNode('concat/axis_flatten', [-1], graph_def)
|
||||
addConstNode('PriorBox/concat/axis', [-2], graph_def)
|
||||
|
||||
for label in ['ClassPredictor', 'BoxEncodingPredictor' if box_predictor == 'convolutional' else 'BoxPredictor']:
|
||||
concatInputs = []
|
||||
for i in range(num_layers):
|
||||
# Flatten predictions
|
||||
flatten = NodeDef()
|
||||
if box_predictor == 'convolutional':
|
||||
inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
|
||||
else:
|
||||
if i == 0:
|
||||
inpName = 'WeightSharedConvolutionalBoxPredictor/%s/BiasAdd' % label
|
||||
else:
|
||||
inpName = 'WeightSharedConvolutionalBoxPredictor_%d/%s/BiasAdd' % (i, label)
|
||||
flatten.input.append(inpName)
|
||||
flatten.name = inpName + '/Flatten'
|
||||
flatten.op = 'Flatten'
|
||||
|
||||
concatInputs.append(flatten.name)
|
||||
graph_def.node.extend([flatten])
|
||||
addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten')
|
||||
|
||||
num_matched_layers = 0
|
||||
for node in graph_def.node:
|
||||
if re.match(r'BoxPredictor_\d/BoxEncodingPredictor/convolution', node.name) or \
re.match(r'BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \
re.match(r'WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name):
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
num_matched_layers += 1
|
||||
assert(num_matched_layers == num_layers)
|
||||
|
||||
# Add layers that generate anchors (bounding boxes proposals).
|
||||
priorBoxes = []
|
||||
boxCoder = config['box_coder'][0]
|
||||
fasterRcnnBoxCoder = boxCoder['faster_rcnn_box_coder'][0]
|
||||
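# Example: the commonly used faster_rcnn_box_coder scales x_scale = y_scale = 10.0 and
# width_scale = height_scale = 5.0 translate into PriorBox variances [0.1, 0.1, 0.2, 0.2].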
boxCoderVariance = [1.0/float(fasterRcnnBoxCoder['x_scale'][0]), 1.0/float(fasterRcnnBoxCoder['y_scale'][0]), 1.0/float(fasterRcnnBoxCoder['width_scale'][0]), 1.0/float(fasterRcnnBoxCoder['height_scale'][0])]
|
||||
for i in range(num_layers):
|
||||
priorBox = NodeDef()
|
||||
priorBox.name = 'PriorBox_%d' % i
|
||||
priorBox.op = 'PriorBox'
|
||||
if box_predictor == 'convolutional':
|
||||
priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
|
||||
else:
|
||||
if i == 0:
|
||||
priorBox.input.append('WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D')
|
||||
else:
|
||||
priorBox.input.append('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/BiasAdd' % i)
|
||||
priorBox.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
priorBox.addAttr('flip', False)
|
||||
priorBox.addAttr('clip', False)
|
||||
|
||||
widths, heights = priors_generator.get(i)
|
||||
|
||||
priorBox.addAttr('width', widths)
|
||||
priorBox.addAttr('height', heights)
|
||||
priorBox.addAttr('variance', boxCoderVariance)
|
||||
|
||||
graph_def.node.extend([priorBox])
|
||||
priorBoxes.append(priorBox.name)
|
||||
|
||||
# Compare this layer's output with Postprocessor/Reshape
|
||||
addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')
|
||||
|
||||
# Sigmoid for classes predictions and DetectionOutput layer
|
||||
addReshape('ClassPredictor/concat', 'ClassPredictor/concat3d', [0, -1, num_classes + 1], graph_def)
|
||||
|
||||
sigmoid = NodeDef()
|
||||
sigmoid.name = 'ClassPredictor/concat/sigmoid'
|
||||
sigmoid.op = 'Sigmoid'
|
||||
sigmoid.input.append('ClassPredictor/concat3d')
|
||||
graph_def.node.extend([sigmoid])
|
||||
|
||||
addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
if box_predictor == 'convolutional':
|
||||
detectionOut.input.append('BoxEncodingPredictor/concat')
|
||||
else:
|
||||
detectionOut.input.append('BoxPredictor/concat')
|
||||
detectionOut.input.append(sigmoid.name + '/Flatten')
|
||||
detectionOut.input.append('PriorBox/concat')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes + 1)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
|
||||
postProcessing = config['post_processing'][0]
|
||||
batchNMS = postProcessing['batch_non_max_suppression'][0]
|
||||
|
||||
if 'iou_threshold' in batchNMS:
|
||||
detectionOut.addAttr('nms_threshold', float(batchNMS['iou_threshold'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
|
||||
if 'score_threshold' in batchNMS:
|
||||
detectionOut.addAttr('confidence_threshold', float(batchNMS['score_threshold'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('confidence_threshold', 0.01)
|
||||
|
||||
if 'max_detections_per_class' in batchNMS:
|
||||
detectionOut.addAttr('top_k', int(batchNMS['max_detections_per_class'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('top_k', 100)
|
||||
|
||||
if 'max_total_detections' in batchNMS:
|
||||
detectionOut.addAttr('keep_top_k', int(batchNMS['max_total_detections'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'SSD model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to a *.config file used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
createSSDGraph(args.input, args.config, args.output)
|
||||
465
samples/dnn/virtual_try_on.py
Normal file
465
samples/dnn/virtual_try_on.py
Normal file
@@ -0,0 +1,465 @@
|
||||
#!/usr/bin/env python3
|
||||
'''
|
||||
You can download the Geometric Matching Module model from https://www.dropbox.com/s/tyhc73xa051grjp/cp_vton_gmm.onnx?dl=0
|
||||
You can download the Try-On Module model from https://www.dropbox.com/s/q2x97ve2h53j66k/cp_vton_tom.onnx?dl=0
|
||||
You can download the cloth segmentation model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
You can find the OpenPose proto in opencv_extra/testdata/dnn/openpose_pose_coco.prototxt
|
||||
and get .caffemodel using opencv_extra/testdata/dnn/download_models.py
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
from numpy import linalg
|
||||
from common import findFile
|
||||
from human_parsing import parse_human
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
|
||||
|
||||
parser = argparse.ArgumentParser(description='Use this script to run virtual try-on using CP-VTON',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_image', '-i', required=True, help='Path to image with person.')
|
||||
parser.add_argument('--input_cloth', '-c', required=True, help='Path to target cloth image')
|
||||
parser.add_argument('--gmm_model', '-gmm', default='cp_vton_gmm.onnx', help='Path to Geometric Matching Module .onnx model.')
|
||||
parser.add_argument('--tom_model', '-tom', default='cp_vton_tom.onnx', help='Path to Try-On Module .onnx model.')
|
||||
parser.add_argument('--segmentation_model', default='lip_jppnet_384.pb', help='Path to cloth segmentation .pb model.')
|
||||
parser.add_argument('--openpose_proto', default='openpose_pose_coco.prototxt', help='Path to OpenPose .prototxt model trained on the COCO dataset.')
|
||||
parser.add_argument('--openpose_model', default='openpose_pose_coco.caffemodel', help='Path to OpenPose .caffemodel model trained on the COCO dataset.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
|
||||
def get_pose_map(image, proto_path, model_path, backend, target, height=256, width=192):
|
||||
radius = 5
|
||||
inp = cv.dnn.blobFromImage(image, 1.0 / 255, (width, height))
|
||||
|
||||
net = cv.dnn.readNet(proto_path, model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
threshold = 0.1
|
||||
_, out_c, out_h, out_w = out.shape
|
||||
pose_map = np.zeros((height, width, out_c - 1))
|
||||
# last label: Background
|
||||
for i in range(0, out.shape[1] - 1):
|
||||
heatMap = out[0, i, :, :]
|
||||
keypoint = np.full((height, width), -1)
|
||||
_, conf, _, point = cv.minMaxLoc(heatMap)
|
||||
x = width * point[0] // out_w
|
||||
y = height * point[1] // out_h
|
||||
if conf > threshold and x > 0 and y > 0:
|
||||
keypoint[y - radius:y + radius, x - radius:x + radius] = 1
|
||||
pose_map[:, :, i] = keypoint
|
||||
|
||||
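# Each keypoint channel is -1 everywhere except a (2 * radius) x (2 * radius) square of 1s
# around the most confident detection (if its score exceeds the threshold); convert to
# channels-first (C, H, W) layout so it can be concatenated with the agnostic representation.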
pose_map = pose_map.transpose(2, 0, 1)
|
||||
return pose_map
|
||||
|
||||
|
||||
class BilinearFilter(object):
|
||||
"""
|
||||
PIL bilinear resize implementation
|
||||
image = image.resize((image_width // 16, image_height // 16), Image.BILINEAR)
|
||||
"""
|
||||
def _precompute_coeffs(self, inSize, outSize):
|
||||
filterscale = max(1.0, inSize / outSize)
|
||||
ksize = int(np.ceil(filterscale)) * 2 + 1
|
||||
|
||||
kk = np.zeros(shape=(outSize * ksize, ), dtype=np.float32)
|
||||
bounds = np.empty(shape=(outSize * 2, ), dtype=np.int32)
|
||||
|
||||
centers = (np.arange(outSize) + 0.5) * filterscale + 0.5
|
||||
bounds[::2] = np.where(centers - filterscale < 0, 0, centers - filterscale)
|
||||
bounds[1::2] = np.where(centers + filterscale > inSize, inSize, centers + filterscale) - bounds[::2]
|
||||
xmins = bounds[::2] - centers + 1
|
||||
|
||||
points = np.array([np.arange(row) + xmins[i] for i, row in enumerate(bounds[1::2])]) / filterscale
|
||||
for xx in range(0, outSize):
|
||||
point = points[xx]
|
||||
bilinear = np.where(point < 1.0, 1.0 - abs(point), 0.0)
|
||||
ww = np.sum(bilinear)
|
||||
kk[xx * ksize : xx * ksize + bilinear.size] = np.where(ww == 0.0, bilinear, bilinear / ww)
|
||||
return bounds, kk, ksize
|
||||
|
||||
def _resample_horizontal(self, out, img, ksize, bounds, kk):
|
||||
for yy in range(0, out.shape[0]):
|
||||
for xx in range(0, out.shape[1]):
|
||||
xmin = bounds[xx * 2 + 0]
|
||||
xmax = bounds[xx * 2 + 1]
|
||||
k = kk[xx * ksize : xx * ksize + xmax]
|
||||
out[yy, xx] = np.round(np.sum(img[yy, xmin : xmin + xmax] * k))
|
||||
|
||||
def _resample_vertical(self, out, img, ksize, bounds, kk):
|
||||
for yy in range(0, out.shape[0]):
|
||||
ymin = bounds[yy * 2 + 0]
|
||||
ymax = bounds[yy * 2 + 1]
|
||||
k = kk[yy * ksize: yy * ksize + ymax]
|
||||
out[yy] = np.round(np.sum(img[ymin : ymin + ymax, 0:out.shape[1]] * k[:, np.newaxis], axis=0))
|
||||
|
||||
def imaging_resample(self, img, xsize, ysize):
|
||||
height, width = img.shape[0:2]
|
||||
bounds_horiz, kk_horiz, ksize_horiz = self._precompute_coeffs(width, xsize)
|
||||
bounds_vert, kk_vert, ksize_vert = self._precompute_coeffs(height, ysize)
|
||||
|
||||
out_hor = np.empty((img.shape[0], xsize), dtype=np.uint8)
|
||||
self._resample_horizontal(out_hor, img, ksize_horiz, bounds_horiz, kk_horiz)
|
||||
out = np.empty((ysize, xsize), dtype=np.uint8)
|
||||
self._resample_vertical(out, out_hor, ksize_vert, bounds_vert, kk_vert)
|
||||
return out
|
||||
|
||||
|
||||
class CpVton(object):
|
||||
def __init__(self, gmm_model, tom_model, backend, target):
|
||||
super(CpVton, self).__init__()
|
||||
self.gmm_net = cv.dnn.readNet(gmm_model)
|
||||
self.tom_net = cv.dnn.readNet(tom_model)
|
||||
self.gmm_net.setPreferableBackend(backend)
|
||||
self.gmm_net.setPreferableTarget(target)
|
||||
self.tom_net.setPreferableBackend(backend)
|
||||
self.tom_net.setPreferableTarget(target)
|
||||
|
||||
    def prepare_agnostic(self, segm_image, input_image, pose_map, height=256, width=192):
        palette = {
            'Background'   : (0, 0, 0),
            'Hat'          : (128, 0, 0),
            'Hair'         : (255, 0, 0),
            'Glove'        : (0, 85, 0),
            'Sunglasses'   : (170, 0, 51),
            'UpperClothes' : (255, 85, 0),
            'Dress'        : (0, 0, 85),
            'Coat'         : (0, 119, 221),
            'Socks'        : (85, 85, 0),
            'Pants'        : (0, 85, 85),
            'Jumpsuits'    : (85, 51, 0),
            'Scarf'        : (52, 86, 128),
            'Skirt'        : (0, 128, 0),
            'Face'         : (0, 0, 255),
            'Left-arm'     : (51, 170, 221),
            'Right-arm'    : (0, 255, 255),
            'Left-leg'     : (85, 255, 170),
            'Right-leg'    : (170, 255, 85),
            'Left-shoe'    : (255, 255, 0),
            'Right-shoe'   : (255, 170, 0)
        }
        color2label = {val: key for key, val in palette.items()}
        head_labels = ['Hat', 'Hair', 'Sunglasses', 'Face', 'Pants', 'Skirt']

        segm_image = cv.cvtColor(segm_image, cv.COLOR_BGR2RGB)
        phead = np.zeros((1, height, width), dtype=np.float32)
        pose_shape = np.zeros((height, width), dtype=np.uint8)
        for r in range(height):
            for c in range(width):
                pixel = tuple(segm_image[r, c])
                if pixel in color2label:
                    if color2label[pixel] in head_labels:
                        phead[0, r, c] = 1
                    if color2label[pixel] != 'Background':
                        pose_shape[r, c] = 255

        input_image = cv.dnn.blobFromImage(input_image, 1.0 / 127.5, (width, height), mean=(127.5, 127.5, 127.5), swapRB=True)
        input_image = input_image.squeeze(0)

        img_head = input_image * phead - (1 - phead)

        downsample = BilinearFilter()
        down = downsample.imaging_resample(pose_shape, width // 16, height // 16)
        res_shape = cv.resize(down, (width, height), interpolation=cv.INTER_LINEAR)

        res_shape = cv.dnn.blobFromImage(res_shape, 1.0 / 127.5, mean=(127.5, 127.5, 127.5), swapRB=True)
        res_shape = res_shape.squeeze(0)

        agnostic = np.concatenate((res_shape, img_head, pose_map), axis=0)
        agnostic = np.expand_dims(agnostic, axis=0)
        return agnostic.astype(np.float32)

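    # GMM stage: the geometric matching network takes the agnostic person representation
    # and the in-shop cloth blob on the inputs named "input.1" and "input.18" (the names
    # used by this sample's graph) and predicts TPS parameters theta, which
    # _apply_transformation later splits into x/y offsets of the grid_size**2 control
    # points (2 * 25 = 50 values for the default 5x5 grid).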
    def get_warped_cloth(self, cloth_img, agnostic, height=256, width=192):
        cloth = cv.dnn.blobFromImage(cloth_img, 1.0 / 127.5, (width, height), mean=(127.5, 127.5, 127.5), swapRB=True)

        self.gmm_net.setInput(agnostic, "input.1")
        self.gmm_net.setInput(cloth, "input.18")
        theta = self.gmm_net.forward()

        grid = self._generate_grid(theta)
        warped_cloth = self._bilinear_sampler(cloth, grid).astype(np.float32)
        return warped_cloth

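    # TOM stage: the try-on module takes the agnostic representation concatenated with the
    # warped cloth along the channel axis and predicts a rendered person plus a composition
    # mask; tanh and sigmoid are applied here in NumPy, presumably because the exported
    # network returns pre-activation values.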
    def get_tryon(self, agnostic, warp_cloth):
        inp = np.concatenate([agnostic, warp_cloth], axis=1)
        self.tom_net.setInput(inp)
        out = self.tom_net.forward()

        p_rendered, m_composite = np.split(out, [3], axis=1)
        p_rendered = np.tanh(p_rendered)
        m_composite = 1 / (1 + np.exp(-m_composite))

        p_tryon = warp_cloth * m_composite + p_rendered * (1 - m_composite)
        rgb_p_tryon = cv.cvtColor(p_tryon.squeeze(0).transpose(1, 2, 0), cv.COLOR_BGR2RGB)
        rgb_p_tryon = (rgb_p_tryon + 1) / 2
        return rgb_p_tryon

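    # Thin-plate spline (TPS) helper: builds the standard TPS system matrix
    #   L = [[K, P], [P^T, 0]]  with  K_ij = r_ij^2 * log(r_ij^2)  and  P = [1, x, y],
    # over the control points and returns its inverse, used below to solve for the
    # warping coefficients.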
    def _compute_L_inverse(self, X, Y):
        N = X.shape[0]

        Xmat = np.tile(X, (1, N))
        Ymat = np.tile(Y, (1, N))
        P_dist_squared = np.power(Xmat - Xmat.transpose(1, 0), 2) + np.power(Ymat - Ymat.transpose(1, 0), 2)

        P_dist_squared[P_dist_squared == 0] = 1
        K = np.multiply(P_dist_squared, np.log(P_dist_squared))

        O = np.ones([N, 1], dtype=np.float32)
        Z = np.zeros([3, 3], dtype=np.float32)
        P = np.concatenate([O, X, Y], axis=1)
        first = np.concatenate((K, P), axis=1)
        second = np.concatenate((P.transpose(1, 0), Z), axis=1)
        L = np.concatenate((first, second), axis=0)
        Li = linalg.inv(L)
        return Li

    def _prepare_to_transform(self, out_h=256, out_w=192, grid_size=5):
        grid_X, grid_Y = np.meshgrid(np.linspace(-1, 1, out_w), np.linspace(-1, 1, out_h))
        grid_X = np.expand_dims(np.expand_dims(grid_X, axis=0), axis=3)
        grid_Y = np.expand_dims(np.expand_dims(grid_Y, axis=0), axis=3)

        axis_coords = np.linspace(-1, 1, grid_size)
        N = grid_size ** 2
        P_Y, P_X = np.meshgrid(axis_coords, axis_coords)

        P_X = np.reshape(P_X, (-1, 1))
        P_Y = np.reshape(P_Y, (-1, 1))

        P_X = np.expand_dims(np.expand_dims(np.expand_dims(P_X, axis=2), axis=3), axis=4).transpose(4, 1, 2, 3, 0)
        P_Y = np.expand_dims(np.expand_dims(np.expand_dims(P_Y, axis=2), axis=3), axis=4).transpose(4, 1, 2, 3, 0)
        return grid_X, grid_Y, N, P_X, P_Y

    def _expand_torch(self, X, shape):
        # Mimics torch.Tensor.expand with NumPy: reshape when the ranks differ,
        # otherwise tile the singleton axes up to the requested shape.
        if len(X.shape) != len(shape):
            return X.flatten().reshape(shape)
        else:
            axis = [1 if src == dst else dst for src, dst in zip(X.shape, shape)]
            return np.tile(X, axis)

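    # Evaluates the TPS mapping at every point of `points`: for each coordinate (x, y),
    #   f(x, y) = a_0 + a_x * x + a_y * y + sum_i w_i * U(d_i),  with U(d) = d * log(d)
    # and d_i the squared distance to control point i. The affine terms (A_*) and radial
    # weights (W_*) are recovered from theta via the inverse TPS matrix from
    # _compute_L_inverse.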
    def _apply_transformation(self, theta, points, N, P_X, P_Y):
        if len(theta.shape) == 2:
            theta = np.expand_dims(np.expand_dims(theta, axis=2), axis=3)

        batch_size = theta.shape[0]

        P_X_base = np.copy(P_X)
        P_Y_base = np.copy(P_Y)

        Li = self._compute_L_inverse(np.reshape(P_X, (N, -1)), np.reshape(P_Y, (N, -1)))
        Li = np.expand_dims(Li, axis=0)

        # split theta into point coordinates
        Q_X = np.squeeze(theta[:, :N, :, :], axis=3)
        Q_Y = np.squeeze(theta[:, N:, :, :], axis=3)

        Q_X += self._expand_torch(P_X_base, Q_X.shape)
        Q_Y += self._expand_torch(P_Y_base, Q_Y.shape)

        points_b = points.shape[0]
        points_h = points.shape[1]
        points_w = points.shape[2]

        P_X = self._expand_torch(P_X, (1, points_h, points_w, 1, N))
        P_Y = self._expand_torch(P_Y, (1, points_h, points_w, 1, N))

        W_X = self._expand_torch(Li[:, :N, :N], (batch_size, N, N)) @ Q_X
        W_Y = self._expand_torch(Li[:, :N, :N], (batch_size, N, N)) @ Q_Y

        W_X = np.expand_dims(np.expand_dims(W_X, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
        W_X = np.repeat(W_X, points_h, axis=1)
        W_X = np.repeat(W_X, points_w, axis=2)

        W_Y = np.expand_dims(np.expand_dims(W_Y, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
        W_Y = np.repeat(W_Y, points_h, axis=1)
        W_Y = np.repeat(W_Y, points_w, axis=2)

        A_X = self._expand_torch(Li[:, N:, :N], (batch_size, 3, N)) @ Q_X
        A_Y = self._expand_torch(Li[:, N:, :N], (batch_size, 3, N)) @ Q_Y

        A_X = np.expand_dims(np.expand_dims(A_X, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
        A_X = np.repeat(A_X, points_h, axis=1)
        A_X = np.repeat(A_X, points_w, axis=2)

        A_Y = np.expand_dims(np.expand_dims(A_Y, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
        A_Y = np.repeat(A_Y, points_h, axis=1)
        A_Y = np.repeat(A_Y, points_w, axis=2)

        points_X_for_summation = np.expand_dims(np.expand_dims(points[:, :, :, 0], axis=3), axis=4)
        points_X_for_summation = self._expand_torch(points_X_for_summation, points[:, :, :, 0].shape + (1, N))

        points_Y_for_summation = np.expand_dims(np.expand_dims(points[:, :, :, 1], axis=3), axis=4)
        points_Y_for_summation = self._expand_torch(points_Y_for_summation, points[:, :, :, 0].shape + (1, N))

        if points_b == 1:
            delta_X = points_X_for_summation - P_X
            delta_Y = points_Y_for_summation - P_Y
        else:
            delta_X = points_X_for_summation - self._expand_torch(P_X, points_X_for_summation.shape)
            delta_Y = points_Y_for_summation - self._expand_torch(P_Y, points_Y_for_summation.shape)

        dist_squared = np.power(delta_X, 2) + np.power(delta_Y, 2)
        dist_squared[dist_squared == 0] = 1
        U = np.multiply(dist_squared, np.log(dist_squared))

        points_X_batch = np.expand_dims(points[:, :, :, 0], axis=3)
        points_Y_batch = np.expand_dims(points[:, :, :, 1], axis=3)

        if points_b == 1:
            points_X_batch = self._expand_torch(points_X_batch, (batch_size, ) + points_X_batch.shape[1:])
            points_Y_batch = self._expand_torch(points_Y_batch, (batch_size, ) + points_Y_batch.shape[1:])

        points_X_prime = A_X[:, :, :, :, 0] + \
                         np.multiply(A_X[:, :, :, :, 1], points_X_batch) + \
                         np.multiply(A_X[:, :, :, :, 2], points_Y_batch) + \
                         np.sum(np.multiply(W_X, self._expand_torch(U, W_X.shape)), 4)

        points_Y_prime = A_Y[:, :, :, :, 0] + \
                         np.multiply(A_Y[:, :, :, :, 1], points_X_batch) + \
                         np.multiply(A_Y[:, :, :, :, 2], points_Y_batch) + \
                         np.sum(np.multiply(W_Y, self._expand_torch(U, W_Y.shape)), 4)

        return np.concatenate((points_X_prime, points_Y_prime), 3)

    def _generate_grid(self, theta):
        grid_X, grid_Y, N, P_X, P_Y = self._prepare_to_transform()
        warped_grid = self._apply_transformation(theta, np.concatenate((grid_X, grid_Y), axis=3), N, P_X, P_Y)
        return warped_grid

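    # NumPy stand-in for a grid_sample operation: normalized grid coordinates in [-1, 1]
    # are mapped to pixel coordinates, the four nearest neighbours are gathered and
    # blended with bilinear weights.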
    def _bilinear_sampler(self, img, grid):
        x, y = grid[:, :, :, 0], grid[:, :, :, 1]

        H = img.shape[2]
        W = img.shape[3]
        max_y = H - 1
        max_x = W - 1

        # rescale x and y to [0, W-1/H-1]
        x = 0.5 * (x + 1.0) * (max_x - 1)
        y = 0.5 * (y + 1.0) * (max_y - 1)

        # grab 4 nearest corner points for each (x_i, y_i)
        x0 = np.floor(x).astype(int)
        x1 = x0 + 1
        y0 = np.floor(y).astype(int)
        y1 = y0 + 1

        # calculate deltas
        wa = (x1 - x) * (y1 - y)
        wb = (x1 - x) * (y - y0)
        wc = (x - x0) * (y1 - y)
        wd = (x - x0) * (y - y0)

        # clip to range [0, H-1/W-1] to not violate img boundaries
        x0 = np.clip(x0, 0, max_x)
        x1 = np.clip(x1, 0, max_x)
        y0 = np.clip(y0, 0, max_y)
        y1 = np.clip(y1, 0, max_y)

        # get pixel value at corner coords
        img = img.reshape(-1, H, W)
        Ia = img[:, y0, x0].swapaxes(0, 1)
        Ib = img[:, y1, x0].swapaxes(0, 1)
        Ic = img[:, y0, x1].swapaxes(0, 1)
        Id = img[:, y1, x1].swapaxes(0, 1)

        wa = np.expand_dims(wa, axis=0)
        wb = np.expand_dims(wb, axis=0)
        wc = np.expand_dims(wc, axis=0)
        wd = np.expand_dims(wd, axis=0)

        # compute output
        out = wa * Ia + wb * Ib + wc * Ic + wd * Id
        return out

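# Custom layer for the 'Correlation' op used by the imported network: it computes
# all-pairs dot products between the two feature maps and returns a correlation volume
# of shape (b, h*w, h, w), matching getMemoryShapes below. The entry point registers it
# with cv.dnn_registerLayer so the importer can resolve the otherwise unknown op.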
class CorrelationLayer(object):
    def __init__(self, params, blobs):
        super(CorrelationLayer, self).__init__()

    def getMemoryShapes(self, inputs):
        featureAShape = inputs[0]
        b, _, h, w = featureAShape
        return [[b, h * w, h, w]]

    def forward(self, inputs):
        feature_A, feature_B = inputs
        b, c, h, w = feature_A.shape
        feature_A = feature_A.transpose(0, 1, 3, 2)
        feature_A = np.reshape(feature_A, (b, c, h * w))
        feature_B = np.reshape(feature_B, (b, c, h * w))
        feature_B = feature_B.transpose(0, 2, 1)
        feature_mul = feature_B @ feature_A
        feature_mul = np.reshape(feature_mul, (b, h, w, h * w))
        feature_mul = feature_mul.transpose(0, 1, 3, 2)
        correlation_tensor = feature_mul.transpose(0, 2, 1, 3)
        correlation_tensor = np.ascontiguousarray(correlation_tensor)
        return [correlation_tensor]

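# Entry point. It assumes the earlier part of this sample defines the argument parser
# result `args` and the helpers get_pose_map, parse_human and findFile used below.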
if __name__ == "__main__":
    if not os.path.isfile(args.gmm_model):
        raise OSError("GMM model does not exist")
    if not os.path.isfile(args.tom_model):
        raise OSError("TOM model does not exist")
    if not os.path.isfile(args.segmentation_model):
        raise OSError("Segmentation model does not exist")
    if not os.path.isfile(findFile(args.openpose_proto)):
        raise OSError("OpenPose proto does not exist")
    if not os.path.isfile(findFile(args.openpose_model)):
        raise OSError("OpenPose model does not exist")

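    # Center-crop the person photo to the 256:192 (4:3) height-to-width aspect ratio that
    # CP-VTON expects before running pose estimation and human parsing.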
    person_img = cv.imread(args.input_image)
    ratio = 256 / 192
    inp_h, inp_w, _ = person_img.shape
    current_ratio = inp_h / inp_w
    if current_ratio > ratio:
        center_h = inp_h // 2
        out_h = inp_w * ratio
        start = int(center_h - out_h // 2)
        end = int(center_h + out_h // 2)
        person_img = person_img[start:end, ...]
    else:
        center_w = inp_w // 2
        out_w = inp_h / ratio
        start = int(center_w - out_w // 2)
        end = int(center_w + out_w // 2)
        person_img = person_img[:, start:end, :]

    cloth_img = cv.imread(args.input_cloth)
    pose = get_pose_map(person_img, findFile(args.openpose_proto),
                        findFile(args.openpose_model), args.backend, args.target)
    segm_image = parse_human(person_img, args.segmentation_model)
    segm_image = cv.resize(segm_image, (192, 256), interpolation=cv.INTER_LINEAR)

    cv.dnn_registerLayer('Correlation', CorrelationLayer)

    model = CpVton(args.gmm_model, args.tom_model, args.backend, args.target)
    agnostic = model.prepare_agnostic(segm_image, person_img, pose)
    warped_cloth = model.get_warped_cloth(cloth_img, agnostic)
    output = model.get_tryon(agnostic, warped_cloth)

    cv.dnn_unregisterLayer('Correlation')

    winName = 'Virtual Try-On'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cv.imshow(winName, output)
    cv.waitKey()