OpenCV / Python：用于实时面部识别的多线程答案

【问题标题】：OpenCV / Python : multi-threading for live facial recognition（OpenCV / Python：用于实时面部识别的多线程）
【发布时间】：2017-07-06 03:57:10
【问题描述】：

我正在使用 OpenCV 和 Dlib，对网络摄像头的实时视频流进行带特征点（landmark）的面部识别，语言是 Python。它在我的 MacBook 笔记本电脑上运行良好，但我需要让它在一台台式机上 24/7 运行。那台电脑是运行 Debian Jessie 的 PC，CPU 为 Intel® Core™2 Quad Q6600 @ 2.40GHz，32 位。在这台机器上性能急剧下降：处理耗时导致了 10 秒的延迟！

因此，我研究了 多线程 以获得性能：

我首先尝试了OpenCv的示例代码，结果很棒！四个核心都达到了 100%，性能要好得多。
然后我用我的代码替换了帧处理代码，它根本没有提高性能！只有一个核心达到 100%，其他核心保持非常低。 我什至认为启用多线程会更糟。

我的面部特征点（landmark）检测代码来自 dlib 的示例代码。我知道它可能还有优化的余地，但我想了解：为什么我无法通过多线程用上我这台（旧）计算机的全部性能？

我会把我的代码放在下面，非常感谢阅读:)

from __future__ import print_function

import numpy as np
import cv2
import dlib

from multiprocessing.pool import ThreadPool
from collections import deque

from common import clock, draw_str, StatValue
import video

class DummyTask:
    """Already-finished stand-in for an asynchronous pool result.

    Wraps a value that was computed synchronously so it can sit in the same
    pending queue as the objects returned by ``pool.apply_async`` and be
    consumed through the same ``ready()`` / ``get()`` calls.
    """

    def __init__(self, data):
        # The result is supplied up front; nothing is deferred.
        self.data = data

    def ready(self):
        """Report completion; always true because the value is precomputed."""
        return True

    def get(self):
        """Hand back the value that was given to the constructor."""
        return self.data

if __name__ == '__main__':
    import sys

    print(__doc__)

    # The first command-line argument selects the video source; fall back to
    # source 0 when none is given. Only the "no argument" case is caught so
    # unrelated errors are not silently swallowed.
    try:
        fn = sys.argv[1]
    except IndexError:
        fn = 0
    cap = video.create_capture(fn)

    # Face detector
    detector = dlib.get_frontal_face_detector()

    # Landmarks shape predictor
    predictor = dlib.shape_predictor("landmarks/shape_predictor_68_face_landmarks.dat")

    # This is where the facial detection takes place
    def process_frame(frame, t0, detector, predictor):
        """Detect faces in ``frame`` and draw their landmarks onto it.

        Args:
            frame: BGR image; it is drawn on in place.
            t0: Capture timestamp, passed through untouched so the caller
                can compute latency.
            detector: Callable returning face rectangles for a grayscale image.
            predictor: Callable returning the landmark shape for one rectangle.

        Returns:
            The ``(frame, t0)`` pair, with landmarks drawn on ``frame``.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # Contrast-limited histogram equalisation before detection.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        clahe_image = clahe.apply(gray)
        # Second argument 1 = upsample the image once before detecting.
        detections = detector(clahe_image, 1)
        for d in detections:
            shape = predictor(clahe_image, d)
            # Landmark indices start at 0; the previous range(1, 68) silently
            # dropped the first point.
            for i in range(shape.num_parts):
                point = shape.part(i)
                cv2.circle(frame, (point.x, point.y), 1, (0, 0, 255), thickness=2)
        return frame, t0

    threadn = cv2.getNumberOfCPUs()
    pool = ThreadPool(processes=threadn)
    pending = deque()

    threaded_mode = True

    latency = StatValue()
    frame_interval = StatValue()
    last_frame_time = clock()
    while True:
        # Display every result that has finished, oldest first, so frames
        # are shown in capture order.
        while pending and pending[0].ready():
            res, t0 = pending.popleft().get()
            latency.update(clock() - t0)
            draw_str(res, (20, 20), "threaded      :  " + str(threaded_mode))
            draw_str(res, (20, 40), "latency        :  %.1f ms" % (latency.value*1000))
            draw_str(res, (20, 60), "frame interval :  %.1f ms" % (frame_interval.value*1000))
            cv2.imshow('threaded video', res)
        # Keep at most one in-flight frame per worker.
        if len(pending) < threadn:
            ret, frame = cap.read()
            if not ret:
                # No frame available (camera gone / end of stream); stop
                # instead of crashing on frame.copy() with frame being None.
                break
            t = clock()
            frame_interval.update(t - last_frame_time)
            last_frame_time = t
            if threaded_mode:
                # Copy so the worker owns its frame while capture continues.
                task = pool.apply_async(process_frame, (frame.copy(), t, detector, predictor))
            else:
                task = DummyTask(process_frame(frame, t, detector, predictor))
            pending.append(task)
        ch = cv2.waitKey(1)
        if ch == ord(' '):
            # Space toggles between pooled and inline processing.
            threaded_mode = not threaded_mode
        if ch == 27:
            # ESC quits.
            break
cv2.destroyAllWindows()

【问题讨论】：

标签： python multithreading opencv face-detection dlib

【解决方案1】：

你可以使用这个，多线程的：

from imutils.video import VideoStream

# Start a VideoStream on the network camera; frames are then obtained
# through vs.read().
videostream = "rtsp://192.168.x.y/user=admin=xxxxxxx_channel=vvvv=1.sdp?params"
# frameSize was referenced but never defined in this snippet, which raised
# NameError. (320, 240) is believed to match VideoStream's own default
# resolution -- adjust to the size you actually need.
frameSize = (320, 240)
vs = VideoStream(src=videostream, resolution=frameSize,
                 framerate=32).start()

# Fetch a frame from the running stream.
frame = vs.read()

【讨论】：

【解决方案2】：

我尝试了一种简化的方法，例如 P.Ro 在他的回答中提到的进程写入输出队列，但不知何故，队列大部分时间都被锁定，因为所有进程同时写入它。（只是我的猜测）我可能做错了什么。

最后我最终使用了管道。

代码很糟糕。但如果换作几个小时前的我，能找到一个不费力就能实际运行的例子，我还是会很高兴的。

from multiprocessing import Process, Queue, Manager,Pipe
import multiprocessing
import face_recognition as fik
import cv2
import time


# Source handed to cv2.VideoCapture in the main section below
# (presumably 0 selects the default camera -- confirm for your setup).
video_input = 0

# Reference picture and its face encoding. Only the first encoding found in
# the image is kept; indexing [0] raises IndexError if none is returned.
# NOTE(review): these lines sit at module level, so with the 'spawn' start
# method they are presumably re-executed in every worker process -- confirm.
obama_image = fik.load_image_file("obama.png")
obama_face_encoding = fik.face_encodings(obama_image)[0]



# NOTE(review): not referenced anywhere in the visible code.
quality = 0.7


def f(id, fi, fl):
    """Worker loop: pull frames from ``fi``, send every face crop to ``fl``.

    Args:
        id: Worker number, used only in the progress message.
        fi: Queue-like object whose ``get()`` yields frames.
        fl: Connection-like object; each cropped face is passed to ``send()``.

    Never returns; the loop runs until the process is stopped from outside.
    """
    # Imported here, inside the worker function, exactly as in the original.
    import face_recognition as detector_lib

    while True:
        image = fi.get()
        print("running thread" + str(id))
        boxes = detector_lib.face_locations(image)

        # Nothing found in this frame: go straight to the next one.
        if len(boxes) == 0:
            continue

        print(boxes)
        for top, right, bottom, left in boxes:
            # Crop the detected rectangle out of the frame and ship it.
            face_crop = image[top:bottom, left:right]
            fl.send(face_crop)

# Timestamp of the previously displayed frame, used for the FPS read-out.
fps_var = 0
if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')

    # The unused Manager() context of the original was dropped: nothing was
    # ever created through it.
    video_capture = cv2.VideoCapture(video_input)

    # Bounded queue feeding frames to the workers.
    fi = Queue(maxsize=14)

    threads = 8
    proc = []

    # One pipe per worker: parent_p holds the ends polled by this process,
    # thread_p the ends given to the workers.
    parent_p = []
    thread_p = []
    for t in range(threads):
        p_t, c_t = Pipe()
        parent_p.append(p_t)
        thread_p.append(c_t)
        print(t)
        # Workers loop forever, so mark them daemonic; otherwise the script
        # can never exit once the main loop ends.
        worker = Process(target=f, args=(t, fi, c_t), daemon=True)
        proc.append(worker)
        worker.start()

    frame_id = 0
    try:
        while True:
            # Grab a single frame of video
            ret, frame = video_capture.read()
            if not ret or frame is None:
                # Camera unavailable or stream finished.
                break
            _, effwidth = frame.shape[:2]
            if effwidth < 20:
                break
            # Resize the frame to a fixed 930-pixel width with a 16:10
            # aspect ratio before handing it to the workers.
            xxx = 930
            yyy = 10/16 #0.4234375
            small_frame = cv2.resize(frame, (xxx, int(xxx*yyy)))
            # Only every second frame is submitted, and only when the queue
            # has room, so capture never blocks on slow workers.
            if frame_id % 2 == 0 and not fi.full():
                fi.put(small_frame)

                print(frame_id)

                cv2.imshow('Video', small_frame)

                now = time.time()
                elapsed = now - fps_var
                # Guard against a zero interval on coarse clocks.
                if elapsed > 0:
                    print("FPS: ", int(1.0 / elapsed))
                fps_var = now

            # GET ALL DETECTIONS
            # At most one detection is handled per captured frame (note the
            # break statements); the rest stay buffered in their pipes.
            for conn in parent_p:
                if not conn.poll():
                    continue
                small_frame_c = conn.recv()
                height34, width34 = small_frame_c.shape[:2]
                # Check the size before displaying: imshow fails on an
                # empty crop.
                if width34 < 20 or height34 < 1:
                    print("face 2 small")
                    print(width34)
                    break
                cv2.imshow('recc', small_frame_c)
                # The crop is the face, so the whole image is passed as the
                # face location (top, right, bottom, left).
                face_encodings_cam = fik.face_encodings(small_frame_c, [(0, width34, height34, 0)])

                match = fik.compare_faces([obama_face_encoding], face_encodings_cam[0])
                name = "Unknown"

                if match[0]:
                    name = "Barack"

                print(name)
                break

            frame_id += 1

            # Hit 'q' on the keyboard to quit!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        video_capture.release()
        cv2.destroyAllWindows()
        # Do not wait at exit for frames still buffered in the queue; the
        # workers that would read them are being stopped.
        fi.cancel_join_thread()
        for worker in proc:
            worker.terminate()
            worker.join()

【讨论】：

【解决方案3】：

性能问题是由 dlib 编译不当造成的。不要使用 pip install dlib：与正确编译的版本相比，它由于某种原因运行得非常缓慢。改为正确编译之后，我的延迟从将近 10 秒缩短到了 2 秒左右。所以最后我并不需要多线程/多进程，不过我仍在努力进一步提高速度。感谢您的帮助:)

【讨论】：

【解决方案4】：

没有太多使用线程池的经验，但我总是只使用如下所示的进程。您应该能够轻松地编辑此代码以满足您的需求。我写这篇文章时考虑到了你的实现。

此代码会获取内核数量，并启动多个工作进程来并行执行所需的功能。它们共享同一个输入帧队列，并且都把结果放入同一个输出队列，供主进程获取和显示。每个队列都有最大长度，本例中为 5。这确保了即使处理需要占用 CPU 时间，显示的画面也始终相对实时。

import numpy as np
import cv2

from multiprocessing import Process, Queue
import time

#from common import clock, draw_str, StatValue
#import video

class Canny_Process(Process):
    """Worker process that turns queued frames into Canny edge images.

    Frames are taken from ``frame_queue``; each result is placed on
    ``output_queue``, discarding the oldest queued result when it is full so
    the consumer always sees recent output.
    """

    def __init__(self,frame_queue,output_queue):
        """Store the two queues; the process is not started here."""
        Process.__init__(self)
        self.frame_queue = frame_queue
        self.output_queue = output_queue
        self.stop = False
        #Initialize your face detectors here

    def get_frame(self):
        """Fetch the next frame, waiting briefly if none is queued.

        Returns:
            ``(True, frame)`` when a frame was obtained, otherwise
            ``(False, None)``.
        """
        import queue

        # A single get() with a short timeout replaces the empty()/get()
        # pair: with several workers on one queue another worker can take
        # the frame between the two calls. The wait also keeps run() from
        # spinning at full CPU while idle.
        try:
            return True, self.frame_queue.get(timeout=0.05)
        except queue.Empty:
            return False, None

    def stopProcess(self):
        """Set the stop flag on this object.

        NOTE: once the process has been started the child works on its own
        copy of this object, so calling this from the parent does not end
        the child's loop; the main script relies on ``daemon = True``.
        """
        self.stop = True

    def _put_latest(self, item):
        """Put ``item`` on the output queue, evicting old entries if full."""
        import queue

        while True:
            try:
                self.output_queue.put_nowait(item)
                return
            except queue.Full:
                # Drop the oldest result to make room. Another process may
                # empty the queue first, which is fine: just retry the put.
                try:
                    self.output_queue.get_nowait()
                except queue.Empty:
                    pass

    def canny_frame(self,frame):
        """Run edge detection on ``frame`` and publish the result."""
        # some intensive computation...
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 100)

        #To simulate CPU Time
        #############################
        for i in range(1000000):
            x = 546*546
            res = x/(i+1)
        #############################
        # REPLACE WITH FACE DETECT CODE HERE

        self._put_latest(edges)

    def run(self):
        """Process frames until the stop flag is set in this process."""
        while not self.stop:
            ret, frame = self.get_frame()
            if ret:
                self.canny_frame(frame)


if __name__ == '__main__':
    import queue

    def put_frame(frame):
        """Queue ``frame`` for the workers, dropping the oldest when full."""
        # put_nowait()/get_nowait() with exception handling replaces the
        # full()/get_nowait() pair: workers drain the queue concurrently,
        # so it may already be empty by the time we try to evict.
        while True:
            try:
                Input_Queue.put_nowait(frame)
                return
            except queue.Full:
                try:
                    Input_Queue.get_nowait()
                except queue.Empty:
                    pass

    def cap_read(cv2_cap):
        """Read one frame from the capture and queue it if the read worked."""
        ret, frame = cv2_cap.read()
        if ret:
            put_frame(frame)

    cap = cv2.VideoCapture(0)

    threadn = cv2.getNumberOfCPUs()

    # Toggled with the space bar; nothing in this snippet reads it.
    threaded_mode = True

    process_list = []
    Input_Queue = Queue(maxsize = 5)
    Output_Queue = Queue(maxsize = 5)

    # Leave one core for this process, but always start at least one worker
    # so single-core machines still produce output.
    for x in range(max(1, threadn - 1)):
        canny_process = Canny_Process(frame_queue = Input_Queue,output_queue = Output_Queue)
        # Daemonic so the workers are stopped when the main process exits.
        canny_process.daemon = True
        canny_process.start()
        process_list.append(canny_process)

    cv2.namedWindow('Threaded Video', cv2.WINDOW_NORMAL)
    try:
        while True:
            cap_read(cap)

            delay = 1
            if not Output_Queue.empty():
                result = Output_Queue.get()
                cv2.imshow('Threaded Video', result)
                delay = 5

            # Poll the keyboard on every iteration. Previously it was only
            # polled after a result was shown, so ESC was missed while no
            # output arrived and a stale key value was re-applied each loop.
            ch = cv2.waitKey(delay)
            if ch == ord(' '):
                threaded_mode = not threaded_mode
            if ch == 27:
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # Do not block at exit on frames still buffered for the workers.
        Input_Queue.cancel_join_thread()

这应该可以解决问题，只需把我的 Canny 函数改成面部检测即可。我基于你的代码写了这个版本并对两者做了比较，这个版本明显更快。我在这里使用的是 multiprocessing.Process。在 Python 中，进程是真正并行的，而线程由于 GIL 并不能真正并行。我使用 2 个队列在主进程和工作进程之间来回传递数据。队列对线程和进程都是安全的。

【讨论】：

非常感谢您花时间编写所有代码 :) 由于某种原因它不太有效，这周我将深入研究它。我刚换了一台 i7-2600K @ 3.40GHz 的电脑，但我仍然遇到性能问题！我猜瓶颈在别的地方。也许是 OSX 与 Debian 在架构上的某些差异？因为在 OSX 上一切正常！再次感谢，我会在本周告诉你我的结果:)
什么不太好用，代码还是性能？代码是否正在运行但仍然不够快，或者代码根本没有运行？如果你给我错误，我可能会告诉你它是什么。谢谢
代码运行但未按预期运行。我已将日志放入此文件中：docs.google.com/document/d/… 再说一次，我还没有时间深入研究它，明天我会这样做。再次感谢:)