Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js

Raphael Mun

4.93/5 (12 votes)

Jul 15, 2020

CPOL

3 min read

12357

In this article, we will take photos of different hand gestures via webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that can recognize the various gestures in real time.

TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let’s make magic happen through deep learning right in our web browser, GPU-accelerated via WebGL using TensorFlow.js!

Starting Point

To recognize multiple hand gestures, we are going to use almost-ready starter code and expand it to detect more categories of objects. Here is what the code will do:

Import TensorFlow.js and TensorFlow’s tf-data.js
Define Touch vs. Not-Touch category labels
Add a video element for the webcam
Run the model prediction every 200 ms after it’s been trained for the first time
Show the prediction result
Load a pre-trained MobileNet model and prepare for transfer learning to as many categories as there are labels
Train and classify a variety of custom objects in images
Skip disposing image and target samples in the training process to keep them for multiple training runs

Here is our starting point for this project:

<html>
    <head>
        <meta charset="UTF-8">
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div id="buttons">
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );

            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );

            // Train the model on new image samples
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );

            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                    error => reject());
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>

Detecting Hand Gestures

The starting point is built ready to detect four different categories: None, Rock, Paper, Scissors. You can try it using your webcam by clicking each of the category buttons to capture some photos (5-6 is a good sample to start with) while you are holding each hand gesture, and then clicking the train button to transfer learning to the neural network. After this, you can improve the model by taking more photos and clicking the train button again.

Additional Hand Gestures and Sign Language

As you can probably imagine, adding more categories becomes harder for the AI to learn and takes more time. However, the results are fun, and AI performs fairly well even from just a couple of photos for each category. Let’s try adding some American Sign Language (ASL) gestures.

To add more, you can include more buttons in the input list, updating the number passed into captureSample(), and modify the labels array accordingly.

You can add whichever signs you would like. I tried adding four that were part of the emoji set:

👌 (Letter D)
👍 (Thumb Up)
🖖 (Vulcan)
🤟 (ILY - I Love You)

Technical Footnotes

If AI does not seem to recognize your hand gestures well, try taking more photos and then training the model multiple times.
While training the model with the various hand gestures, keep in mind that it sees the full image; it doesn’t necessarily know that the hand by itself distinguishes the categories. It may be difficult to accurately recognize different hand gestures without numerous samples from different hands.
Sometimes, the model learns to differentiate between left and right hands, and sometimes it does not, which could affect predictions after multiple rounds of training.

Finish Line

For your reference, here is the full code for this project:

<html>
    <head>
        <meta charset="UTF-8">
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div id="buttons">
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="captureSample(4)">👌 (Letter D)</button>
            <button onclick="captureSample(5)">👍 (Thumb Up)</button>
            <button onclick="captureSample(6)">🖖 (Vulcan)</button>
            <button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
            "👌 (Letter D)",
            "👍 (Thumb Up)",
            "🖖 (Vulcan)",
            "🤟 (ILY - I Love You)"
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );

            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );

            // Train the model on new image samples
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );

            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                    error => reject());
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>

What’s Next?

This project showed you how to start training your own computer vision AI to recognize potentially unlimited gestures, objects, species of animals, or even types of foods. The rest is up to you; the future of deep learning and AI might start right within your browser.

I hope you enjoyed following along with these examples. And as you experiment with more ideas, don’t forget to have fun!