From ef6cb2fafe7602126802f1d473d1fa4de27be04a Mon Sep 17 00:00:00 2001 From: Yotam Erel Date: Fri, 25 Aug 2023 12:45:53 +0900 Subject: [PATCH 1/2] adds mirror_annotation. change cmd flags to be consistent with common practices. --- src/icatcher/cli.py | 53 +++++------- src/icatcher/face_detector.py | 5 +- src/icatcher/options.py | 156 +++++++++++++++++----------------- src/icatcher/video.py | 2 +- tests/test_basic.py | 54 +++++++++++- 5 files changed, 159 insertions(+), 111 deletions(-) diff --git a/src/icatcher/cli.py b/src/icatcher/cli.py index 73d8f04..44c94d7 100644 --- a/src/icatcher/cli.py +++ b/src/icatcher/cli.py @@ -21,7 +21,6 @@ parallelize_face_detection, detect_face_opencv_dnn, ) -from pathos.helpers import cpu_count from batch_face import RetinaFace @@ -278,11 +277,12 @@ def create_output_streams(video_path, framerate, resolution, opt): prediction_output_file = Path( opt.output_annotation, video_path.stem + opt.output_file_suffix ) - if opt.output_format == "PrefLookTimestamp": - with open(prediction_output_file, "w", newline="") as f: # Write header - f.write( - "Tracks: left, right, away, codingactive, outofframe\nTime,Duration,TrackName,comment\n\n" - ) + if prediction_output_file.exists(): + if opt.overwrite: + prediction_output_file.unlink() + else: + raise FileExistsError("Annotation output file already exists. Use --overwrite flag to overwrite.") + return video_output_file, prediction_output_file, skip @@ -367,11 +367,7 @@ def predict_from_video(opt): last_class_text = "" # Initialize so that we see the first class assignment as an event to record # if going to use cpu parallelization, don't allow for live stream video - if use_cpu and opt.fd_model == "retinaface" and not opt.dont_buffer: - # figure out how many cpus can be used - num_cpus = cpu_count() - opt.num_cpus_saved - assert num_cpus > 0 - + if use_cpu and opt.fd_model == "retinaface" and opt.fd_parallel_processing: # send all frames in to be preprocessed and have faces detected prior to running gaze detection total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) vid_frames = range( @@ -384,9 +380,9 @@ def predict_from_video(opt): processed_frames[0].shape[0], processed_frames[0].shape[1], ) - logging.debug("face detection on buffered frames ...") + logging.info("performing face detection on buffered frames...") faces = parallelize_face_detection( - processed_frames, face_detector_model, num_cpus, opt + processed_frames, face_detector_model, opt.fd_num_cpus, opt ) del processed_frames @@ -413,7 +409,7 @@ def predict_from_video(opt): frames.append(frame) if ( - use_cpu and opt.fd_model == "retinaface" and not opt.dont_buffer + use_cpu and opt.fd_model == "retinaface" and opt.fd_parallel_processing ): # if using cpu, just pull from master bboxes = master_bboxes[frame_count] elif opt.fd_model == "opencv_dnn": @@ -528,6 +524,11 @@ def predict_from_video(opt): corrected_transitions, ) class_text = reverse_classes[answers[cursor]] + if opt.mirror_annotation: + if class_text == "left": + class_text = "right" + elif class_text == "right": + class_text = "left" if opt.on_off: class_text = "off" if class_text == "away" else "on" if opt.output_video_path: @@ -576,16 +577,6 @@ def predict_from_video(opt): confidences[cursor], ) ) - elif opt.output_format == "PrefLookTimestamp": - if ( - class_text != last_class_text - ): # Record "event" for change of direction if code has changed - frame_ms = int( - (frame_count + cursor + 1) * (1000.0 / framerate) - ) - with open(prediction_output_file, "a", newline="") as f: - f.write("{},0,{}\n".format(frame_ms, class_text)) - last_class_text = class_text logging.info( "frame: {}, class: {}, confidence: {:.02f}, cur_fps: {:.02f}".format( str(frame_count + cursor + 1), @@ -624,12 +615,14 @@ def cleanup( if opt.output_video_path: video_output_file.release() if opt.output_annotation: # write footer to file - if opt.output_format == "PrefLookTimestamp": - start_ms = int((1000.0 / framerate) * (opt.sliding_window_size // 2)) - end_ms = int((1000.0 / framerate) * frame_count) - with open(prediction_output_file, "a", newline="") as f: - f.write("{},{},codingactive\n".format(start_ms, end_ms)) - elif opt.output_format == "compressed": + if opt.output_format == "compressed": + answers = np.array(answers) + confidences = np.array(confidences) + if opt.mirror_annotation: + lefts = answers == classes["left"] + rights = answers == classes["right"] + answers[lefts] = classes["right"] + answers[rights] = classes["left"] np.savez(prediction_output_file, answers, confidences) cap.release() diff --git a/src/icatcher/face_detector.py b/src/icatcher/face_detector.py index b842a6e..b0ecf9e 100644 --- a/src/icatcher/face_detector.py +++ b/src/icatcher/face_detector.py @@ -4,6 +4,7 @@ from pathos.pools import ProcessPool from icatcher import draw import logging +from tqdm import tqdm def threshold_faces(all_faces: list, confidence_threshold: float): @@ -60,12 +61,10 @@ def process_frames(cap, frames, h_start_at, h_end_at, w_start_at, w_end_at): :param h_end_at: optional crop coordinate :param w_start_at: optional crop coordinate :param w_end_at: optional crop coordinate - :param v :return: list of images corresponding to video frames """ processed_frames = [] - for frame in frames: - logging.debug("buffering frames {}/{}".format(frame, len(frames))) + for frame in tqdm(frames, desc="buffering frames"): cap.set(cv2.CAP_PROP_POS_FRAMES, frame) ret, image = cap.read() if ret: diff --git a/src/icatcher/options.py b/src/icatcher/options.py index 2c1958e..876c293 100644 --- a/src/icatcher/options.py +++ b/src/icatcher/options.py @@ -1,7 +1,7 @@ import argparse from pathlib import Path from . import version - +from pathos.helpers import cpu_count def parse_arguments(my_string=None): """ @@ -13,7 +13,7 @@ def parse_arguments(my_string=None): parser.add_argument( "source", type=str, - help="the source to use (path to video file, folder or webcam id)", + help="The source to use (path to video file, folder or webcam id).", ) parser.add_argument( "--model", @@ -25,12 +25,19 @@ def parse_arguments(my_string=None): "icatcher+_bw-cali.pth", "icatcher+_senegal.pth", ], - help="model file that will be used for gaze detection", + help="Model file that will be used for gaze detection.", + ) + parser.add_argument( + "--fd_model", + type=str, + choices=["retinaface", "opencv_dnn"], + default="retinaface", + help="The face detector model used. opencv_dnn may be more suitable for cpu usage if speed is priority over accuracy.", ) parser.add_argument( "--use_fc_model", action="store_true", - help="if supplied, will use face classifier " + help="If supplied, will use face classifier " "to decide which crop to use from every frame.", ) parser.add_argument( @@ -42,21 +49,21 @@ def parse_arguments(my_string=None): "face_classifier_cali-bw.pth", "face_classifier_senegal.pth", ], - help="face classifier model file that will be used for deciding " - "which crop should we select from every frame. ", + help="Face classifier model file that will be used for deciding " + "which crop should we select from every frame.", ) parser.add_argument( "--source_type", type=str, default="file", choices=["file", "webcam"], - help="selects source of stream to use.", + help="Selects source of stream to use.", ) parser.add_argument( "--crop_percent", type=int, default=0, - help="A percent to crop video frames to prevent other people from appearing", + help="A percent to crop video frames to prevent other people from appearing.", ) parser.add_argument( "--crop_mode", @@ -64,61 +71,47 @@ def parse_arguments(my_string=None): choices=["top", "left", "right"], nargs="+", default=["top"], - help="where to crop video from, multi-choice.", + help="Where to crop video from, multi-choice.", ) parser.add_argument( - "--track_face", + "--show_output", action="store_true", - help="if detection is lost, will keep track of face using last known position.", + help="Show results online in a separate window.", ) parser.add_argument( - "--show_output", - action="store_true", - help="show results online in a separate window", + "--output_annotation", type=str, help="Folder to output annotations to." ) parser.add_argument( - "--output_annotation", type=str, help="folder to output annotations to" + "--overwrite", action="store_true", + help="If an output annotation file exists, will overwrite it. Without this flag iCatcher+ will terminate upon encountering an existing annotation file." ) parser.add_argument( "--on_off", action="store_true", - help="left/right/away annotations will be swapped with on/off (only works with icatcher+)", + help="Left/right/away annotations will be swapped with on/off.", + ) + parser.add_argument( + "--mirror_annotation", + action="store_true", + help="Left will be swapped with right, and right will be swapped with left.", ) parser.add_argument( "--output_format", type=str, default="raw_output", - choices=["raw_output", "compressed", "PrefLookTimestamp"], - ) # https://osf.io/3n97m/ - PrefLookTimestamp coding standard + choices=["raw_output", "compressed"], + ) parser.add_argument( "--output_video_path", - help="if present, annotated video will be saved to this folder", + help="If present, annotated video will be saved to this folder.", ) parser.add_argument( "--pic_in_pic", action="store_true", - help="if present, a mini picture with detection will be shown in the output video", - ) - parser.add_argument( - "--output_file_suffix", type=str, default=".txt", help="the output file suffix" - ) - parser.add_argument( - "--image_size", - type=int, - default=100, - help="All images will be resized to this size", - ) - parser.add_argument( - "--sliding_window_size", - type=int, - default=9, - help="Number of frames in rolling window of each datapoint", + help="If present, a mini picture with detections will be shown in the output video.", ) parser.add_argument( - "--window_stride", - type=int, - default=2, - help="Stride between frames in rolling window", + "--output_file_suffix", type=str, default=".txt", help="The output file suffix." ) parser.add_argument( "--per_channel_mean", @@ -126,7 +119,7 @@ def parse_arguments(my_string=None): metavar=("Channel1_mean", "Channel2_mean", "Channel3_mean"), type=float, default=[0.485, 0.456, 0.406], - help="supply custom per-channel mean of data for normalization", + help="Supply custom per-channel mean of data for normalization.", ) parser.add_argument( "--per_channel_std", @@ -134,80 +127,83 @@ def parse_arguments(my_string=None): metavar=("Channel1_std", "Channel2_std", "Channel3_std"), type=float, default=[0.229, 0.224, 0.225], - help="supply custom per-channel std of data for normalization", + help="Supply custom per-channel std of data for normalization.", ) parser.add_argument( "--gpu_id", type=int, default=-1, help="GPU id to use, use -1 for CPU." ) parser.add_argument("--log", help="If present, writes log to this path") parser.add_argument( - "-v", "--verbosity", type=str, choices=["debug", "info", "warning"], default="info", - help="Selects verbosity level", + help="Selects verbosity level.", ) parser.add_argument( "--video_filter", type=str, - help="provided file will be used to filter only test videos," - " will assume certain file structure using the lookit/cali-bw/senegal datasets", - ) - parser.add_argument( - "--raw_dataset_path", - type=str, - help="path to raw dataset (required if --video_filter is passed", - ) - parser.add_argument( - "--raw_dataset_type", - type=str, - choices=["lookit", "cali-bw", "senegal", "generic"], - default="lookit", - help="the type of dataset to preprocess", + help="Provided file will be used to filter only test videos," + " will assume certain file structure using the lookit/cali-bw/senegal datasets.", ) parser.add_argument( "--illegal_transitions_path", type=str, - help="path to CSV with illegal transitions to 'smooth' over", + help="Path to CSV with illegal transitions to 'smooth' over.", ) parser.add_argument("--version", action="version", version="%(prog)s " + version) # face detection options: - parser.add_argument( - "--fd_model", - type=str, - choices=["retinaface", "opencv_dnn"], - default="retinaface", - help="the face detector model used. opencv_dnn may be more suitable for cpu usage if speed is priority over accuracy", - ) parser.add_argument( "--fd_confidence_threshold", type=float, - help="the score confidence threshold that needs to be met for a face to be detected", + help="The score confidence threshold that needs to be met for a face to be detected.", + ) + parser.add_argument( + "--fd_parallel_processing", + action="store_true", + default=False, + help="(cpu, retinaface only) face detection will be parallelized, by batching the frames (requires buffering them), increasing memory usage, but decreasing overall processing time. Disallows live stream of results.", ) parser.add_argument( - "--num_cpus_saved", + "--fd_num_cpus", type=int, - default=0, - help="(retinaface only) amount of cpus to not use in parallel processing of face detection", + default=-1, + help="(cpu, retinaface only) amount of cpus to use if face detection parallel processing is true (-1: use all available cpus)).", ) parser.add_argument( "--fd_batch_size", type=int, default=16, - help="(retinaface only) amount of frames fed into face detector at one time for batch inference", + help="(cpu, retinaface only) amount of frames fed at once into face detector if parallel processing is true.", ) parser.add_argument( "--fd_skip_frames", type=int, default=0, - help="(cpu only) amount of frames to skip between each face detection. previous bbox will be used", + help="(cpu, retinaface only) amount of frames to skip between each face detection if parallel processing is true. previous bbox will be used.", ) parser.add_argument( - "--dont_buffer", + "--track_face", action="store_true", - default=False, - help="(cpu, retinaface only) frames will not be buffered, decreasing memory usage, but increasing processing time. Allows live stream of results.", + help="If detection is lost, will keep track of face using last known position. WARNING: untested experimental feature.", + ) + parser.add_argument( + "--image_size", + type=int, + default=100, + help="All images will be resized to this size. WARNING: changing default results in untested behavior.", + ) + parser.add_argument( + "--sliding_window_size", + type=int, + default=9, + help="Number of frames in rolling window of each datapoint. WARNING: changing default results in untested behavior.", + ) + parser.add_argument( + "--window_stride", + type=int, + default=2, + help="Stride between frames in rolling window. WARNING: changing default results in untested behavior.", ) if my_string is not None: args = parser.parse_args(my_string.split()) @@ -231,8 +227,6 @@ def parse_arguments(my_string=None): args.video_filter = Path(args.video_filter) if not args.video_filter.is_file() and not args.video_filter.is_dir(): raise FileNotFoundError("Video filter is not a file or a folder") - if args.raw_dataset_path: - args.raw_dataset_path = Path(args.raw_dataset_path) if args.output_annotation: args.output_annotation = Path(args.output_annotation) args.output_annotation.mkdir(exist_ok=True, parents=True) @@ -261,4 +255,14 @@ def parse_arguments(my_string=None): if not torch.cuda.is_available(): raise ValueError("GPU is not available. Was torch compiled with CUDA?") + # figure out how many cpus can be used + use_cpu = True if args.gpu_id == -1 else False + if use_cpu: + if args.fd_num_cpus == -1: + args.fd_num_cpus = cpu_count() + else: + if args.fd_num_cpus > cpu_count(): + raise ValueError( + "Number of cpus requested is greater than available cpus" + ) return args diff --git a/src/icatcher/video.py b/src/icatcher/video.py index 3f37533..aa92db9 100644 --- a/src/icatcher/video.py +++ b/src/icatcher/video.py @@ -163,5 +163,5 @@ def get_video_paths(opt): ) else: # video_paths = [int(opt.source)] - raise NotImplementedError + raise NotImplementedError("sources other than video file or folder of videos are not currently supported.") return video_paths diff --git a/tests/test_basic.py b/tests/test_basic.py index 8a5ff57..82296bc 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,10 +1,13 @@ import pytest import numpy as np import icatcher +from icatcher.cli import predict_from_video from pathlib import Path - def test_parse_illegal_transitions(): + """ + tests handling the option "illegal transitions". + """ bad_path1 = Path("tests/test_data/illegal_transitions_bad1.csv") bad_path2 = Path("tests/test_data/illegal_transitions_bad2.csv") bad_path3 = Path("tests/test_data/illegal_transitions_bad3.csv") @@ -25,6 +28,9 @@ def test_parse_illegal_transitions(): def test_process_video(): + """ + tests processing a video file. + """ arguments = "tests/test_data/test.mp4" opt = icatcher.options.parse_arguments(arguments) source = Path(opt.source) @@ -41,6 +47,52 @@ def test_process_video(): def test_mask(): + """ + tests masking an image. + """ image = np.random.random((256, 512, 3)) masked = icatcher.draw.mask_regions(image, 0, 128, 0, 256) assert masked[:128, 256:, :].all() == 0 + + +@pytest.mark.parametrize( + "args_string", + [ + "tests/test_data/test.mp4 --model icatcher+_lookit.pth --fd_model opencv_dnn --output_annotation tests/test_data --overwrite", + "tests/test_data/test.mp4 --model icatcher+_lookit.pth --fd_model opencv_dnn --output_annotation tests/test_data", + "tests/test_data/test.mp4 --model icatcher+_lookit.pth --fd_model opencv_dnn --output_annotation tests/test_data --mirror_annotation --overwrite", + "tests/test_data/test.mp4 --model icatcher+_lookit.pth --fd_model opencv_dnn --output_annotation tests/test_data --output_format compressed --overwrite", + "tests/test_data/test.mp4 --model icatcher+_lookit.pth --fd_model opencv_dnn --output_annotation tests/test_data --mirror_annotation --output_format compressed --overwrite", + ], +) +def test_predict_from_video(args_string): + """ + runs entire prediction pipeline with several command line options. + """ + args = icatcher.options.parse_arguments(args_string) + if not args.overwrite: + try: + predict_from_video(args) + except FileExistsError: # should be raised if overwrite is False and file exists, which is expected since this is the second test + return + else: + predict_from_video(args) + if args.output_annotation: + if args.output_format == "compressed": + output_file = Path("tests/test_data/test.npz") + data = np.load(output_file) + predicted_classes = data["arr_0"] + confidences = data["arr_1"] + else: + output_file = Path("tests/test_data/test.txt") + with open(output_file, "r") as f: + data = f.readlines() + predicted_classes = [x.split(",")[1].strip() for x in data] + predicted_classes = np.array([icatcher.classes[x] for x in predicted_classes]) + confidences = np.array([float(x.split(",")[2].strip()) for x in data]) + assert len(predicted_classes) == len(confidences) + # assert len(predicted_classes) == 194 # hard coded number of frames in test video + if args.mirror_annotation: + assert (predicted_classes == 2).all() + else: + assert (predicted_classes == 1).all() \ No newline at end of file From c8aec7a0069ced30a5a22a0a3047ee3905a75d15 Mon Sep 17 00:00:00 2001 From: Yotam Erel Date: Fri, 25 Aug 2023 12:51:34 +0900 Subject: [PATCH 2/2] resolves #62 --- pyproject.toml | 4 ++-- src/icatcher/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4df5dee..c8f987f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "icatcher" -version = "0.1.2" +version = "0.2.0" description = "iCatcher+: Robust and automated annotation of infant gaze from videos collected in laboratory, field, and online studies." readme = "README.md" authors = [{ name = "Yotam Erel", email = "erelyotam@gmail.com" }] @@ -39,7 +39,7 @@ dev = ["pytest"] Homepage = "https://github.com/yoterel/icatcher_plus" [tool.bumpver] -current_version = "0.1.2" +current_version = "0.2.0" version_pattern = "MAJOR.MINOR.PATCH" commit_message = "bump version {old_version} -> {new_version}" commit = false diff --git a/src/icatcher/__init__.py b/src/icatcher/__init__.py index ed870fb..e7d4644 100644 --- a/src/icatcher/__init__.py +++ b/src/icatcher/__init__.py @@ -1,5 +1,5 @@ ### define version -__version__ = "0.1.2" +__version__ = "0.2.0" version = __version__ ### define classes classes = {"noface": -2, "nobabyface": -1, "away": 0, "left": 1, "right": 2}