Reading image and video metadata in google drive folders recursively

2024, Aug 19    

Reading image and video metadata in google drive folders recursively

First, create an OAuth client in Google Cloud.

Enable the Google Drive API

Navigate to “APIs & Services” > “Library”. Search for “Google Drive API” and enable it.

Create OAuth 2.0 Credentials

  • Go to “APIs & Services” > “Credentials”.
  • Click “Create Credentials” and select “OAuth client ID”.
  • Configure the consent screen. For development, you can set it as an “Internal” app, which will allow only users in your organization (or the account you’re using) to test it. If you choose “External,” you’ll need to verify the app for public use.
  • Under “Test users”, add the Google account email address that you will use to sign in on the consent screen.

Python script to extract image and video metadata

We will use the pydrive2 library to access Google Drive contents. Here is the script. When it runs, it first opens the app consent screen; sign in with the Google account that you added as a test user when configuring the consent screen, and the script will then proceed. Make sure that same account has at least read access to the Google Drive folder you want to crawl. The script also needs Pillow (for image EXIF data) and the ffprobe command-line tool (for video metadata) to be installed:

# %%
import csv
import json
import numbers
import os
import subprocess
import tempfile
from fractions import Fraction
from pprint import pprint

from PIL import Image, ExifTags, TiffImagePlugin
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
# %%
# Authenticate and create the PyDrive client.
# The functions below all use the module-level ``drive`` created here, so this
# cell must run before any of them is called.
# NOTE(review): PyDrive2 presumably reads the OAuth client secrets from a
# file in the working directory -- confirm the expected file name/location.
gauth = GoogleAuth()
# Creates local webserver and automatically handles authentication
# (this is the step that sends the user through the consent screen).
gauth.LocalWebserverAuth()
# Drive client bound to the authenticated session above.
drive = GoogleDrive(gauth)
# %%
def list_drive_files(folder_id):
    """Return the non-trashed items directly inside one Drive folder.

    ``folder_id`` is the Drive ID of the parent folder. The result is whatever
    ``drive.ListFile(...).GetList()`` yields for the query, i.e. the folder's
    direct children only -- sub-folders are listed but not descended into.
    """
    # Drive search query: direct children of the folder, excluding the trash.
    search_params = {'q': f"'{folder_id}' in parents and trashed=false"}
    return drive.ListFile(search_params).GetList()


def check_exif_metadata(file_id):
    """Download a Drive image and return its EXIF tags as a dict.

    Args:
        file_id: Google Drive ID of the file to inspect.

    Returns:
        A dict mapping EXIF tag names (from ``ExifTags.TAGS``) to their
        values, with ``bytes`` and ``IFDRational`` values left out, or
        ``None`` when the file is not an image, carries no EXIF data, or
        could not be downloaded/parsed (the error is printed).
    """
    file = drive.CreateFile({'id': file_id})
    file.FetchMetadata()
    mime_type = file['mimeType']

    # Nothing to extract from non-images, so skip the download entirely.
    if not mime_type.startswith('image/'):
        return None

    # delete=False: the file is re-opened by name below and removed by hand
    # in the ``finally`` clause.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        try:
            # Download the file to a temporary location
            file.GetContentFile(temp_file.name, mimetype=mime_type)

            with Image.open(temp_file.name) as img:
                # _getexif() is a private Pillow helper that only some format
                # plugins define; treat its absence as "no EXIF" instead of
                # failing with AttributeError.
                read_exif = getattr(img, '_getexif', None)
                exif_data = read_exif() if read_exif is not None else None

            if not exif_data:
                return None

            print(f"EXIF metadata found for file: {file['title']}")
            # Keep only tags with a known name; raw bytes and rational values
            # are dropped, as before.
            return {
                ExifTags.TAGS[k]: v
                for k, v in exif_data.items()
                if k in ExifTags.TAGS
                and not isinstance(v, (bytes, TiffImagePlugin.IFDRational))
            }
        except Exception as e:
            # Per-file boundary, mirroring check_video_metadata: one broken or
            # unsupported image must not abort the whole crawl.
            print(f"Error processing image file {file['title']}: {e}")
            return None
        finally:
            # Ensure the temporary file is removed
            temp_file.close()
            os.unlink(temp_file.name)
def check_video_metadata(file_id):
    """Download a Drive video and return its container tags via ffprobe.

    Args:
        file_id: Google Drive ID of the file to inspect.

    Returns:
        The ``format.tags`` mapping from ffprobe's JSON output, or ``None``
        when there are no tags, ffprobe fails, or the file could not be
        downloaded/parsed (the error is printed).
    """
    file = drive.CreateFile({'id': file_id})
    file.FetchMetadata()
    title = file['title']

    # delete=False: the file is handed to ffprobe by name and removed by hand
    # in the ``finally`` clause.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        try:
            # Download the file to a temporary location
            file.GetContentFile(temp_file.name)

            # Run ffprobe command to extract metadata
            cmd = [
                'ffprobe', '-v', 'error', '-show_entries',
                'format_tags', '-print_format', 'json', temp_file.name
            ]
            result = subprocess.run(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
            # Surface ffprobe's own diagnostics instead of parsing whatever it
            # left on stdout after a failure.
            if result.returncode != 0:
                print(
                    f"ffprobe failed for video file {title} ({file_id}): "
                    f"{result.stderr.strip()}"
                )
                return None

            metadata = json.loads(result.stdout)

            # Check if there are any tags/metadata
            return metadata.get('format', {}).get('tags')
        except Exception as e:
            # Per-file boundary: report using the Drive title/ID (the temp
            # path is meaningless to the reader) and keep the crawl going.
            print(f"Error processing video file {title} ({file_id}): {e}")
            return None
        finally:
            # Ensure the temporary file is removed
            temp_file.close()
            os.unlink(temp_file.name)

def convert_value(value):
    """Convert an EXIF value (possibly nested) to a JSON-serializable form.

    Conversions applied:
      * non-integer rationals -> ``float``. This covers Pillow's
        ``TiffImagePlugin.IFDRational`` (a ``numbers.Rational`` subclass) as
        well as ``fractions.Fraction``, without referring to the
        ``IFDRational`` name, which this file never imports.
      * ``bytes`` -> ``str`` (undecodable bytes are dropped).
      * ``tuple`` / ``list`` / ``dict`` -> same container type with every
        element (dict values only, not keys) converted recursively.

    Anything else, including ``None``, ``int`` and ``bool``, is returned
    unchanged.
    """
    # int and bool also register as numbers.Rational but are already
    # JSON-native, so only true (non-integral) rationals become floats.
    if isinstance(value, numbers.Rational) and not isinstance(value, numbers.Integral):
        return float(value)
    if isinstance(value, bytes):
        return value.decode(errors="ignore")  # Decode bytes to string
    if isinstance(value, tuple):
        return tuple(convert_value(v) for v in value)
    if isinstance(value, list):
        return [convert_value(v) for v in value]
    if isinstance(value, dict):
        return {k: convert_value(v) for k, v in value.items()}
    return value

def convert_exif_to_serializable(exif):
    """Return ``exif`` passed through ``convert_value``.

    Thin convenience wrapper: the whole metadata structure is handed to
    ``convert_value``, which walks nested containers itself.
    """
    serializable = convert_value(exif)
    return serializable
# %%
# %%
# List files and check for EXIF metadata.
#
# The folder tree is walked iteratively: ``folders`` is a stack of Drive
# folder IDs still to visit, seeded with the starting folder(s). One CSV row
# is written for every item found (sub-folders included, with "null" as
# their metadata).
folders = ["PUT_YOUR_FOLDER_ID_HERE"]
# newline="" is what the csv module requires of the file object (otherwise
# extra blank rows appear on Windows); an explicit UTF-8 encoding keeps
# non-ASCII file names from failing under the platform's default codec.
with open("metdata_attack_on_hindu.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["name","parent_dir_name","mime_type","gdrive_id","parent_gdrive_id","metadata"])
    while folders:
        current_folder = folders.pop()
        # Fetch the folder's own metadata so its title can be recorded as the
        # parent directory name of every row below.
        fold = drive.CreateFile({'id': current_folder})
        fold.FetchMetadata()
        current_folder_name = fold['title']
        print(f"{current_folder_name=}")
        files = list_drive_files(current_folder)
        print(f"There are total {len(files)} file or folder in this directory")
        for file in files:
            print(file["title"])
            file.FetchMetadata()
            mime_type = file['mimeType']
            print(mime_type)
            if "folder" in mime_type:
                # Queue the sub-folder for a later iteration of the while loop.
                print(f"{file['title']} is a folder")
                folders.append(file["id"])
                exif = None
            elif mime_type.startswith("image/"):
                print(file["id"])
                print("this file is an image")
                # NOTE(review): the Drive file resource may also expose an
                # "imageMediaMetadata" entry (incl. location); not used here.
                exif = check_exif_metadata(file["id"])
                pprint(exif)
            elif mime_type.startswith("video/"):
                print(file["id"])
                print("this file is a video")
                exif = check_video_metadata(file["id"])
                pprint(exif)
            else:
                print(file["id"])
                print("This file is neither image nor a video")
                exif = None
            # Metadata goes into a single CSV cell as a JSON string.
            writer.writerow([file["title"],fold["title"],mime_type,file["id"],fold["id"],json.dumps(convert_value(exif))])