Demo 52: Computer vision with ESP32 Camera

1. Introduction

I have made many demos for ESP32-Camera



Today I will show you how to make computer vision applications using the ESP32-CAM. As you know, the ESP32 is a strong MCU, but it is not strong enough to run a computer vision algorithm directly, although Google has published TensorFlow versions (the TensorFlow C API and TensorFlow.js) for the ESP32 (refer to Demo 47 and Demo 50 above).
There is another way to overcome this. That is using ESP32 as a Camera and streaming video (Demo 48 and Demo 49) to a server for processing and returning the result back to clients. The server can be Cloud or local server. This is a DIY project so I will use local server for demo.
The following libraries are needed for this demo:
- Python3 (aiohttp) : HTTP server for Python
- Python3 Opencv : famous computer vision library
Install the necessary libraries for Python3 (you need to install python3 and pip3 - standard packages manager for Python3):
pip3 install --user aiohttp
pip3 install --user numpy
pip3 install --user opencv-python
pip3 install --user jinja2
pip3 install --user aiohttp_jinja2
(os, asyncio, and base64 are part of the Python 3 standard library and do not need to be installed.)
The Diagram of the Demo:


In the figure, the ESP32 Camera continuously streams data to the local server via a websocket.
The local server applies computer vision to the streamed data and returns the result to the web browser via another websocket.
2. Hardware
An ESP32-Camera module
A USB-to-Serial adapter for flashing the firmware to the ESP32-Cam
3. Software
The ESP32 Arduino needs the Websocket Client library to communicate with local server.
This library requires the latest ESP32-Arduino core to work.
The ESP32-Camera code:
#include <Arduino.h>
#include <WiFi.h>
#include <WebSocketsClient.h>
#include "esp_camera.h"

#define PWDN_GPIO_NUM     32
#define RESET_GPIO_NUM    -1
#define XCLK_GPIO_NUM      0
#define SIOD_GPIO_NUM     26
#define SIOC_GPIO_NUM     27

#define Y9_GPIO_NUM       35
#define Y8_GPIO_NUM       34
#define Y7_GPIO_NUM       39
#define Y6_GPIO_NUM       36
#define Y5_GPIO_NUM       21
#define Y4_GPIO_NUM       19
#define Y3_GPIO_NUM       18
#define Y2_GPIO_NUM        5
#define VSYNC_GPIO_NUM    25
#define HREF_GPIO_NUM     23
#define PCLK_GPIO_NUM     22

// Configure and initialise the ESP32 camera driver (AI-Thinker pin map,
// QVGA JPEG, single frame buffer). On failure the error code is printed
// and the function returns; the sketch keeps running without a camera.
void configCamera(){
  // BUG FIX: zero-initialise the whole struct. camera_config_t has gained
  // fields over driver versions (e.g. fb_location, grab_mode), and any
  // member left unassigned would otherwise contain indeterminate stack
  // garbage when passed to esp_camera_init().
  camera_config_t config = {};
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer = LEDC_TIMER_0;
  // Parallel data bus D0..D7.
  config.pin_d0 = Y2_GPIO_NUM;
  config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM;
  config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM;
  config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM;
  config.pin_d7 = Y9_GPIO_NUM;
  // Clock and sync signals.
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href = HREF_GPIO_NUM;
  // SCCB (I2C-like) control bus.
  config.pin_sscb_sda = SIOD_GPIO_NUM;
  config.pin_sscb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;      // 20 MHz sensor clock
  config.pixel_format = PIXFORMAT_JPEG;

  config.frame_size = FRAMESIZE_QVGA;  // 320x240: small enough to stream smoothly
  config.jpeg_quality = 9;             // 0-63, lower = better quality
  config.fb_count = 1;

  esp_err_t err = esp_camera_init(&config);
  if (err != ESP_OK) {
    Serial.printf("Camera init failed with error 0x%x", err);
    return;
  }
}

// Websocket client used to stream camera frames to the local server.
WebSocketsClient webSocket;
 
const char *ssid      = "xyz";          // WiFi network name (replace with yours)
const char *password  = "xyz";          // WiFi password (replace with yours)
const char *serverIp  = "192.168.1.8";  // address of the local processing server
const char *camId     = "living room";  // identifies this camera to the server
bool connected = false;                 // set/cleared by webSocketEvent()

 
// Callback for websocket client events: tracks connection state in the
// global `connected` flag and logs text responses from the server.
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
  if (type == WStype_DISCONNECTED) {
    Serial.printf("Disconnected!\n");
    connected = false;
  } else if (type == WStype_CONNECTED) {
    Serial.printf("[WSc] Connected to url: %s\n", payload);
    connected = true;
  } else if (type == WStype_TEXT) {
    Serial.printf("RESPONSE: %s\n", payload);
  }
  // Binary, ping/pong, error and fragment events are intentionally ignored.
}

void liveCam(){
  //capture a frame
  camera_fb_t * fb = esp_camera_fb_get();
  if (!fb) {
      Serial.println("Frame buffer could not be acquired");
      return;
  }
  //send to server
  String start = "start:" + String(camId);
  webSocket.sendTXT(start);
  webSocket.sendBIN(fb->buf, fb->len);
  webSocket.sendTXT("end");

  //return the frame buffer back to be reused
  esp_camera_fb_return(fb);
}

// Bring up serial logging, join the WiFi network, open the websocket
// to the processing server, and initialise the camera.
void setup() {
    Serial.begin(115200);

    WiFi.begin(ssid, password);
    while (WiFi.status() != WL_CONNECTED) {
      delay(500);
      Serial.print(".");
    }
    Serial.print("Local IP: ");
    Serial.println(WiFi.localIP());

    webSocket.begin(serverIp, 8080, "/ws");  // server address, port and URL
    webSocket.onEvent(webSocketEvent);       // register the event handler
    configCamera();                          // configure the camera module
}

// Main loop: service the websocket, and while connected send roughly
// ten frames per second to the server.
void loop() {
    webSocket.loop();
    if (!connected) {
      return;
    }
    liveCam();
    delay(100);
}
The server starts an HTTP server that listens on two websockets and one HTTP endpoint. The first websocket receives the streamed camera data from the ESP32-Camera websocket client. The HTTP endpoint serves the GUI. The second websocket sends the processed result back to the GUI.
The "cv2_processing" function uses OpenCV Haar cascades to detect faces and eyes, drawing an ellipse around each detected face and a circle around each detected eye.
The ".xml" files can be downloaded here:

from aiohttp import web
import aiohttp
import threading
import queue
import numpy as np
import cv2
import jinja2
import aiohttp_jinja2
import os
import asyncio
import base64

# Frames received from cameras, handed to the worker thread; bounded so a
# slow consumer drops frames instead of growing without limit.
cams_queue = queue.Queue(maxsize=10)
# Open browser websockets that should receive processed frames.
monitor_queue = []

# Haar cascade classifiers; the .xml files must sit next to this script.
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
eyes_cascade = cv2.CascadeClassifier('haarcascade_eye_tree_eyeglasses.xml')

def cv2_processing(frame):
    """Detect faces and eyes in *frame* and annotate them in place.

    Draws a magenta ellipse around each detected face and a blue circle
    around each eye found inside a face region, then returns the
    (possibly annotated) frame. Detection is best-effort: an OpenCV
    failure on one frame is logged and the frame is returned unchanged
    so the stream keeps flowing.
    """
    if frame is None:
        # cv2.imdecode returns None for a corrupt/partial JPEG; nothing to do.
        return frame
    try:
        frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        frame_gray = cv2.equalizeHist(frame_gray)
        #-- Detect faces
        faces = face_cascade.detectMultiScale(frame_gray)
        for (x, y, w, h) in faces:
            center = (x + w//2, y + h//2)
            frame = cv2.ellipse(frame, center, (w//2, h//2), 0, 0, 360, (255, 0, 255), 4)
            faceROI = frame_gray[y:y+h, x:x+w]
            #-- In each face, detect eyes
            eyes = eyes_cascade.detectMultiScale(faceROI)
            for (x2, y2, w2, h2) in eyes:
                # Eye coordinates are relative to the face ROI; shift back.
                eye_center = (x + x2 + w2//2, y + y2 + h2//2)
                radius = int(round((w2 + h2) * 0.25))
                frame = cv2.circle(frame, eye_center, radius, (255, 0, 0), 4)
    except cv2.error as exc:
        # Was a silent bare `except: pass`; log so failures are visible.
        print('cv2_processing failed:', exc)
    return frame

def handle_cams(cams_queue, monitor_queue):
    """Worker thread: decode queued camera frames, run detection, fan out.

    Pulls ``[cam_id, chunks]`` items from *cams_queue* (``chunks`` is the
    list of binary websocket messages making up one JPEG), runs
    ``cv2_processing`` on the decoded image, and sends the result as a
    base64 JPEG string to every browser websocket in *monitor_queue*.
    Runs forever; intended to be started as a daemon thread.
    """
    # Private event loop used to drive the async send from this thread.
    # NOTE(review): the websockets belong to aiohttp's own loop, so
    # asyncio.run_coroutine_threadsafe on the app loop would be the safer
    # pattern — TODO confirm before relying on this under load.
    loop = asyncio.new_event_loop()
    while True:
        # Blocking get: the original `if not cams_queue.empty()` poll
        # busy-spun a CPU core whenever the queue was empty.
        cam_id, chunks = cams_queue.get()
        # Concatenate the binary chunks directly; np.array(list).tobytes()
        # went through an object/bytes ndarray and could corrupt the JPEG.
        jpeg = np.frombuffer(b''.join(chunks), dtype=np.uint8)
        img = cv2.imdecode(jpeg, flags=cv2.IMREAD_COLOR)
        if img is None:
            # Corrupt/partial frame: skip instead of crashing the thread.
            continue
        img = cv2_processing(img)
        retval, buffer = cv2.imencode('.jpg', img)
        if not retval:
            continue
        jpg_as_text = base64.b64encode(buffer).decode("utf-8")
        # Iterate over a copy so we can drop dead sockets while looping.
        for ws in list(monitor_queue):
            try:
                loop.run_until_complete(ws.send_str(jpg_as_text))
            except Exception:
                # Browser went away; stop sending to it from now on.
                print('websocket closed')
                if ws in monitor_queue:
                    monitor_queue.remove(ws)

async def websocket_cam_handler(request):
    """Receive camera frames from the ESP32 over a websocket.

    Protocol per frame: a ``start:<cam_id>`` text message, one or more
    binary JPEG chunks, then an ``end`` text message, at which point the
    accumulated chunks are queued for the processing thread.
    """
    ws = web.WebSocketResponse()
    await ws.prepare(request)
    chunks = []
    cam_id = None
    async for msg in ws:
        if msg.type == aiohttp.WSMsgType.BINARY:
            chunks.append(msg.data)
        elif msg.type == aiohttp.WSMsgType.TEXT:
            text = msg.data
            if text.startswith('start'):
                cam_id = text.replace('start:', '')
                chunks = []
            elif text == 'end':
                cams_queue.put([cam_id, chunks])
        elif msg.type == aiohttp.WSMsgType.ERROR:
            await ws.close()
            print(ws.exception())
    return ws

async def monitor_cam_handler(request):
    """Register a browser websocket to receive processed frames.

    The socket is appended to the global *monitor_queue* so the worker
    thread can push base64 JPEG frames to it, and — fixing a leak in the
    original — removed again when the connection ends for any reason, so
    the worker stops trying to send to dead clients.
    """
    ws = web.WebSocketResponse()
    await ws.prepare(request)
    monitor_queue.append(ws)
    try:
        async for msg in ws:
            if msg.type == aiohttp.WSMsgType.TEXT:
                if msg.data == 'close':
                    await ws.close()
            elif msg.type == aiohttp.WSMsgType.ERROR:
                await ws.close()
                print(ws.exception())
    finally:
        # Always deregister, even if the receive loop raises.
        if ws in monitor_queue:
            monitor_queue.remove(ws)
    return ws

@aiohttp_jinja2.template("index.html")
async def index(request):
    """Serve the monitoring GUI (templates/index.html); no template context."""
    return dict()

if __name__ == '__main__':
    # Run the OpenCV processing off the event loop in a background thread.
    # daemon=True fixes a hang: the thread loops forever, so a non-daemon
    # thread would keep the interpreter alive after Ctrl-C stops the server.
    worker = threading.Thread(
        target=handle_cams,
        args=(cams_queue, monitor_queue),
        daemon=True,
    )
    worker.start()

    app = web.Application()
    aiohttp_jinja2.setup(
        app, loader=jinja2.FileSystemLoader(os.path.join(os.getcwd(), "templates"))
    )
    app.router.add_get('/', index, name="index")
    app.router.add_get('/ws', websocket_cam_handler)     # ESP32 camera stream
    app.router.add_get('/monitor', monitor_cam_handler)  # browser clients

    # Default port 8080 — must match the ESP32 sketch and the GUI script.
    web.run_app(app)
The GUI for Web browser is in HTML
<html>
	<head>
		<title> ESP32 Camera Computer Vision </title>
		<!-- jQuery is required by the websocket setup script -->
		<script src='http://code.jquery.com/jquery-1.9.1.min.js'></script>
	</head>
	<body>
		<!-- This <img> is refreshed with base64 JPEG frames pushed over the /monitor websocket -->
		<img id='live' src=''>
	</body>
</html>

<script>

jQuery(function($){
	if (!('WebSocket' in window)) {
		alert('Your browser does not support web sockets');
	}else{
		setup()
	}
	function setup(){
		var host = 'ws://192.168.1.8:8080/monitor';
		var socket = new WebSocket(host);
		socket.binaryType = 'arraybuffer';
		if(socket){
				socket.onopen = function() {
				}
				socket.onmessage = function(msg){
					var bytes = msg.data;
					var img = document.getElementById('live');
					img.src = 'data:image/jpg;base64,'+bytes;
				}
				socket.onclose = function(){
					showServerResponse('The connection has been closed.');
				}
		}
	}
});
</script>
4. Result


Post a Comment

1 Comments

Anonymous said…
Discount handbags are supplied by Vlone.