Эх сурвалжийг харах

Updated code to work with yolo_det, pplite_seg, and new_decoding_model

Tricolops 3 өдөр өмнө
parent
commit
0446931faf

+ 1 - 0
examples/bbox_detection/labels.txt

@@ -1,3 +1,4 @@
 __ignore__
 _background_
 barcode
+package

+ 68 - 54
labelme/ai/barcode_decode.py

@@ -24,21 +24,21 @@ class CodeSet:
     B = 2
     C = 3
 
-class Normalize:
-    def __init__(self, mean=(0.45, 0.45, 0.45), std=(0.24, 0.24, 0.24)):
-        if not (isinstance(mean, (list, tuple)) and isinstance(std, (list, tuple))):
-            raise ValueError("mean and std should be of type list or tuple.")
-        self.mean = np.array(mean, dtype=np.float32)
-        self.std = np.array(std, dtype=np.float32)
-
-        # Reshape for broadcasting to apply mean and std across the spatial dimensions of an image
-        self.mean = self.mean.reshape((1, 1, 3))
-        self.std = self.std.reshape((1, 1, 3))
-
-    def __call__(self, img):
-        img = img.astype(np.float32) / 255.0  # Scale pixel values to [0, 1]
-        img = (img - self.mean) / self.std  # Normalize
-        return img
+# class Normalize:
+#     def __init__(self, mean=(0.45, 0.45, 0.45), std=(0.25, 0.25, 0.25)):
+#         if not (isinstance(mean, (list, tuple)) and isinstance(std, (list, tuple))):
+#             raise ValueError("mean and std should be of type list or tuple.")
+#         self.mean = np.array(mean, dtype=np.float32)
+#         self.std = np.array(std, dtype=np.float32)
+
+#         # Reshape for broadcasting to apply mean and std across the spatial dimensions of an image
+#         self.mean = self.mean.reshape((1, 1, 3))
+#         self.std = self.std.reshape((1, 1, 3))
+
+#     def __call__(self, img):
+#         img = img.astype(np.float32) / 255.0  # Scale pixel values to [0, 1]
+#         img = (img - self.mean) / self.std  # Normalize
+#         return img
 
 class BarcodeDecodeModel:
     def __init__(self, decoding_model_path=None):
@@ -52,8 +52,8 @@ class BarcodeDecodeModel:
             self.decoding_net = self.ie.read_model(model=decoding_model_path)
             self.decoding_sess = self.ie.compile_model(model=self.decoding_net, device_name="CPU")
 
-        self.decoding_input_shape = (1, 3, 32, 256)
-        self.normalize = Normalize()  # Normalization instance
+        self.decoding_input_shape = (1, 3, 32, 512)
+        # self.normalize = Normalize()  # Normalization instance
         self._lock = threading.Lock()
         self._image_embedding_cache = collections.OrderedDict()
         self._max_cache_size = 10
@@ -70,19 +70,23 @@ class BarcodeDecodeModel:
     #     self.pixmap = pixmap
     #     logger.debug("Pixmap set successfully in BarcodeDecodeModel.")
 
-    def preprocess_image(self, normalized_img):
-        norm = Normalize(mean=(0.45, 0.45, 0.45), std=(0.25, 0.25, 0.25))
-        resized_image = cv2.resize(normalized_img, (self.decoding_input_shape[3], self.decoding_input_shape[2]))
-        resized_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)
-        resized_image = norm(resized_image)
-        
-        # Resize image for detection model input size
-        logger.debug(f"Preprocessing image for detection: {normalized_img.shape}")
-        # resized_image = resized_image.astype('float32') / 255.0
-
-        input_tensor = resized_image.transpose(2, 0, 1)  # Convert HWC to CHW
-        input_tensor = np.expand_dims(input_tensor, 0)  # Add batch dimension
+    def preprocess_image(self, img):
+        logger.debug(f"Preprocessing image for decoding: {img.shape}")
+        img = cv2.resize(
+            img,
+            (self.decoding_input_shape[3], self.decoding_input_shape[2]),
+            interpolation=cv2.INTER_NEAREST
+        )
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        img = img.astype("float32") / 255.0
+        mean = np.array([0.45, 0.45, 0.45], dtype=np.float32)
+        std = np.array([0.25, 0.25, 0.25], dtype=np.float32)
+        img = (img - mean) / std
+        img = img.transpose(2, 0, 1)
+        img = img.astype("float32")
+        input_tensor = np.expand_dims(img, 0)
         logger.debug(f"Processed image shape: {input_tensor.shape}")
+
         return input_tensor
     
     def decode_from_points(self, points, detection_idx, original_image):
@@ -95,33 +99,34 @@ class BarcodeDecodeModel:
             str: Decoded text from the decoding model.
         """
         try:
-
             # Convert scaled_points to a numpy array
             polygon = np.array(points, dtype=np.int32)
 
             # Create a mask of the same size as the original image
             # original_image = labelme.utils.img_qt_to_arr(self.pixmap.toImage())
             # cv2.imwrite(f"original_image{detection_idx + 1}.png", original_image)
-            mask = np.zeros(original_image.shape[:2], dtype=np.uint8)
-            cv2.fillPoly(mask, [polygon], 255)  # Fill the polygon with white
+
+            # mask = np.zeros(original_image.shape[:2], dtype=np.uint8)
+            # cv2.fillPoly(mask, [polygon], 255)  # Fill the polygon with white
 
             # Apply the mask to the original image
-            masked_image = cv2.bitwise_and(original_image, original_image, mask=mask)
+            # masked_image = cv2.bitwise_and(original_image, original_image, mask=mask)
+            # cv2.imwrite(f"masked_image{detection_idx + 1}.png", masked_image)
 
             # Get the bounding rectangle of the polygon to crop the ROI
             x, y, w, h = cv2.boundingRect(polygon)
-            cropped_image_dec = masked_image[y:y+h, x:x+w]
-
+            # cropped_image_dec = masked_image[y:y+h, x:x+w]
             # cv2.imwrite(f"cropped_exact_{detection_idx + 1}.png", cropped_image_dec)
-            logger.debug(f"cropped_exact image saved at  {detection_idx + 1}.")
-             
+            # logger.debug(f"cropped_exact image saved at  {detection_idx + 1}.")
+
             src_points = np.float32(points)
+            
 
             # Calculate the width and height of the barcode based on scaled_points
-            width = int(np.linalg.norm(src_points[0] - src_points[1]))
-            # print(width)
-            height = int(np.linalg.norm(src_points[1] - src_points[2]))
-            # print(height)
+            width = int(np.ceil(np.linalg.norm(src_points[0] - src_points[1])))
+            print(f" width:{ width}")
+            height = int(np.ceil(np.linalg.norm(src_points[1] - src_points[2])))
+            print(f" height:{ height}")
 
             # Correct width/height if needed
             if width < height:
@@ -133,23 +138,32 @@ class BarcodeDecodeModel:
                     src_points[3],  # Bottom-right becomes bottom-left
                     src_points[0]   # Bottom-left becomes top-left
                 ])
+            print(f"src_points:{src_points}")
 
             # Define destination points for the flattened barcode
             dst_points = np.float32([
                 [0, 0],
-                [width - 1, 0],
-                [width - 1, height - 1],
-                [0, height - 1]
+                [width, 0],
+                [width, height],
+                [0, height]
             ])
 
             # Calculate the perspective transformation matrix
-            M = cv2.getPerspectiveTransform(src_points, dst_points)
-
+            M = cv2.getPerspectiveTransform(src_points, dst_points)            
+            print(f" M:{ M}")
+            # t_start = time.perf_counter()
+            
             # Apply the perspective transformation
-            aligned_barcode = cv2.warpPerspective(original_image, M, (width, height), flags=cv2.INTER_LINEAR)
+            aligned_barcode = cv2.warpPerspective(original_image, M, (width, height), flags=cv2.INTER_LANCZOS4)
+            
+            # t_end = time.perf_counter()
+            # print(
+            #     f"[TIMING] Total crop+align time: {(t_end - t_start) * 1000:.3f} ms | "
+            #     f"aligned_size=({height},{width})"
+            # )
 
             # Save the aligned barcode image
-            # cv2.imwrite(f"decoding_barcode_{detection_idx + 1}.png", aligned_barcode)
+            # cv2.imwrite(f"old_aligned_barcode_decoding_barcode_{detection_idx + 1}.png", aligned_barcode)
             # logger.debug(f"Aligned barcode saved at  {detection_idx + 1}.")
 
             # Normalize the image to scale pixel intensities to the range [0, 255]
@@ -157,9 +171,9 @@ class BarcodeDecodeModel:
             cv2.normalize(aligned_barcode, normalized_img, 0, 255, cv2.NORM_MINMAX)
             logger.debug("Image normalized.")
 
-            # Save the cropped image
-            cv2.imwrite(f"cropped_image_decoding_normalized{detection_idx + 1}.png",normalized_img)
-            logger.debug(f"Saved normalized image for decoding : {detection_idx + 1}")
+            # # Save the cropped image
+            cv2.imwrite(f"cropped_image__normalized{detection_idx + 1}.png",normalized_img)
+            # logger.debug(f"Saved normalized image for decoding : {detection_idx + 1}")
 
             # Run decoding model
             # confidence = None 
@@ -195,10 +209,10 @@ class BarcodeDecodeModel:
         decode_result = self.decoding_sess.infer_new_request({'x': preprocessed_img})
         output_tensor = decode_result['save_infer_model/scale_0.tmp_0']
         logger.debug(f"Output tensor shape: {output_tensor.shape}")
-        print(f"decode_result: {decode_result}")
+        # print(f"decode_result: {decode_result}")
         output_indices_batch = np.argmax(output_tensor, axis=2)
         output_probs_batch = np.max(output_tensor, axis=2)
-        print(f"output_probs_batch:{output_probs_batch}")
+        # print(f"output_probs_batch:{output_probs_batch}")
         # Decode text from indices
 
         def preprocess_output_indices(output_indices_batch, output_probs_batch):
@@ -206,7 +220,7 @@ class BarcodeDecodeModel:
             if output_indices_batch is None or len(output_indices_batch) == 0:
                 return False, "Empty output indices batch", None
 
-            print(f"output_indices_batch: {output_indices_batch}")
+            # print(f"output_indices_batch: {output_indices_batch}")
             first_row = output_indices_batch[0]
             first_row_probs = output_probs_batch[0]
             if first_row is None or len(first_row) == 0:

+ 104 - 60
labelme/ai/barcode_detect.py

@@ -17,95 +17,129 @@ class Normalize:
         self.mean = np.array(mean, dtype=np.float32)
         self.std = np.array(std, dtype=np.float32)
 
-        # Reshape for broadcasting to apply mean and std across the spatial dimensions of an image
         self.mean = self.mean.reshape((1, 1, 3))
         self.std = self.std.reshape((1, 1, 3))
 
     def __call__(self, img):
-        img = img.astype(np.float32) / 255.0  # Scale pixel values to [0, 1]
-        img = (img - self.mean) / self.std  # Normalize
+        img = img.astype(np.float32) / 255.0
+        img = (img - self.mean) / self.std
         return img
 
+
 class BarcodeDetectModel:
     def __init__(self, detection_model_path, segmentation_model_path=None):
+
         self.ie = ov.Core()
 
-        # Load detection model
         self.detection_net = self.ie.read_model(model=detection_model_path)
-        self.detection_sess = self.ie.compile_model(model=self.detection_net, device_name="CPU")
+        self.detection_sess = self.ie.compile_model(self.detection_net, "CPU")
         self.detection_request = self.detection_sess.create_infer_request()
-        
-        # Load segmentation model if provided
+
         self.segmentation_net = None
         self.segmentation_sess = None
+
         if segmentation_model_path:
             self.segmentation_net = self.ie.read_model(model=segmentation_model_path)
-            self.segmentation_sess = self.ie.compile_model(model=self.segmentation_net, device_name="CPU")
-        
+            self.segmentation_sess = self.ie.compile_model(self.segmentation_net, "CPU")
+
         self._lock = threading.Lock()
-        self.input_height = 800  # Input shape for detection model (example size)
-        self.input_width = 800
-        self.segmentation_input_shape = (1, 3, 160, 320)  # Input shape for segmentation model
+
+        self.input_height = 1024
+        self.input_width = 1024
+
+        self.segmentation_input_shape = (1, 1, 192, 320)
+
         self._image_embedding_cache = collections.OrderedDict()
         self._max_cache_size = 10
-        self.normalize = Normalize()  # Normalization instance
+
+        self.normalize = Normalize()
 
     def set_image(self, image: np.ndarray):
+
         with self._lock:
+
             self.raw_width = image.shape[1]
             self.raw_height = image.shape[0]
-            # Preprocess the image
+
             input_tensor = self.preprocess_image(image)
             self._image = input_tensor
-            # Prepare other inputs
-            self._im_shape = np.array([[self.raw_height, self.raw_width]], dtype=np.float32)
-            self._scale_factor = np.array([[1.0, 1.0]], dtype=np.float32)
+
             self._thread = threading.Thread(
                 target=self._compute_and_cache_image_embedding
             )
             self._thread.start()
 
     def preprocess_image(self, image, for_segmentation=False):
+
         if for_segmentation:
-            # Resize image to segmentation model input size
+
             logger.debug(f"Preprocessing image for segmentation: {image.shape}")
+
             norm = Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5))
-            resized_image = cv2.resize(image, (self.segmentation_input_shape[3], self.segmentation_input_shape[2]))  # Width, Height
+
+            resized_image = cv2.resize(
+                image,
+                (self.segmentation_input_shape[3], self.segmentation_input_shape[2])
+            )
+
             resized_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)
-            resized_image = norm(resized_image)  # Normalize for segmentation model
+            resized_image = norm(resized_image)
+
         else:
-            # Resize image for detection model input size
+
             logger.debug(f"Preprocessing image for detection: {image.shape}")
-            resized_image = cv2.resize(image, (self.input_width, self.input_height))
-            resized_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)
-            resized_image = self.normalize(resized_image)
-            # resized_image = resized_image.astype('float32') / 255.0
-        input_tensor = resized_image.transpose(2, 0, 1)  # Convert HWC to CHW
-        input_tensor = np.expand_dims(input_tensor, 0)  # Add batch dimension
-        # logger.debug(f"Processed image shape: {input_tensor.shape}")
+
+            h, w = image.shape[:2]
+
+            r = min(self.input_height / h, self.input_width / w)
+
+            nh, nw = int(round(h * r)), int(round(w * r))
+
+            image_resized = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_LINEAR)
+
+            top = (self.input_height - nh) // 2
+            bottom = self.input_height - nh - top
+            left = (self.input_width - nw) // 2
+            right = self.input_width - nw - left
+
+            image_padded = cv2.copyMakeBorder(
+                image_resized,
+                top, bottom, left, right,
+                cv2.BORDER_CONSTANT,
+                value=(114,114,114)
+            )
+
+            # store for scaling
+            self._letterbox_gain = r
+            self._letterbox_pad = (left, top)
+
+            resized_image = cv2.cvtColor(image_padded, cv2.COLOR_BGR2RGB)
+            resized_image = resized_image.astype(np.float32) / 255.0
+
+        input_tensor = resized_image.transpose(2, 0, 1)
+        input_tensor = np.expand_dims(input_tensor, 0)
+
         return input_tensor
 
     def _compute_and_cache_image_embedding(self):
+
         with self._lock:
-            # Prepare the inputs dictionary
-            inputs = {
-                'image': self._image,
-                'im_shape': self._im_shape,
-                'scale_factor': self._scale_factor
-            }
-            # Perform inference
+
+            inputs = [self._image]
+
             self._result = self.detection_request.infer(inputs)
-            # print("models results:", self._result)
 
     def _get_image_embedding(self):
+
         if self._thread is not None:
             self._thread.join()
             self._thread = None
+
         with self._lock:
-            new_result = self._result
-            return new_result
+            return self._result
 
     def predict_mask_from_points(self,points=None,point_labels=None):
+
         return self._collect_result_from_output(
             outputs=self._get_image_embedding(),
             raw_width=self.raw_width,
@@ -113,36 +147,46 @@ class BarcodeDetectModel:
         )
 
     def predict_polygon_from_points(self,points=None,point_labels=None):
+
         result_list=self.predict_mask_from_points(points,point_labels)
         return result_list
 
     def _collect_result_from_output(self, outputs, raw_width, raw_height):
-        # Extract the desired output array from outputs dictionary
-        output_array = None
-        for key in outputs:
-            if 'save_infer_model/scale_0.tmp_0' in key.names:
-                output_array = outputs[key]
-                break
-        if output_array is None:
-            raise ValueError("Desired output not found in outputs")
-
-        outputs = output_array  # shape [50,6]
+
+        output_array = list(outputs.values())[0]
+
+        outputs = output_array[0]  # (300,6)
+
         point_list = []
+
         thresh_hold = 0.5
 
+        gain = self._letterbox_gain
+        pad_x, pad_y = self._letterbox_pad
+
         for bbox_info in outputs:
-            score = bbox_info[1]
+
+            score = float(bbox_info[4])
+
             if score > thresh_hold:
-                x1_raw = bbox_info[2]
-                y1_raw = bbox_info[3]
-                x2_raw = bbox_info[4]
-                y2_raw = bbox_info[5]
-                # print(f"Raw bbox coordinates: x1={x1_raw}, y1={y1_raw}, x2={x2_raw}, y2={y2_raw}")
-                x1 = max(min(int(x1_raw), raw_width - 1), 0)
-                y1 = max(min(int(y1_raw), raw_height - 1), 0)
-                x2 = max(min(int(x2_raw), raw_width - 1), 0)
-                y2 = max(min(int(y2_raw), raw_height - 1), 0)
-                # print(f"Clamped bbox coordinates: x1={x1}, y1={y1}, x2={x2}, y2={y2}")
+
+                x1_raw = bbox_info[0]
+                y1_raw = bbox_info[1]
+                x2_raw = bbox_info[2]
+                y2_raw = bbox_info[3]
+
+                x1 = (x1_raw - pad_x) / gain
+                y1 = (y1_raw - pad_y) / gain
+                x2 = (x2_raw - pad_x) / gain
+                y2 = (y2_raw - pad_y) / gain
+
+                x1 = max(min(int(x1), raw_width - 1), 0)
+                y1 = max(min(int(y1), raw_height - 1), 0)
+                x2 = max(min(int(x2), raw_width - 1), 0)
+                y2 = max(min(int(y2), raw_height - 1), 0)
+
                 point_xy = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+
                 point_list.append(point_xy)
-        return point_list
+
+        return point_list

+ 4 - 4
labelme/widgets/canvas.py

@@ -1117,7 +1117,7 @@ class Canvas(QtWidgets.QWidget):
         # logger.debug(f"scaled points: {scaled_points}")
         return scaled_points
     
-    def expand_bbox(self, x_min, y_min, x_max, y_max, factor=1.5):
+    def expand_bbox(self, x_min, y_min, x_max, y_max, factor=1.3):
         """
         Expands the bounding box by a given factor.
         """
@@ -1200,7 +1200,7 @@ class Canvas(QtWidgets.QWidget):
                     rotated = False
                 # Save crop image
                 # cv2.imwrite(f"cropped_image_{detection_idx + 1}.png", cropped_image)
-                logger.debug(f"Saved cropped image for detection {detection_idx + 1}: {cropped_image.shape}")
+                # logger.debug(f"Saved cropped image for detection {detection_idx + 1}: {cropped_image.shape}")
 
                 # logger.debug(f"Cropped image shape for detection {detection_idx + 1}: {cropped_image.shape}")
             except Exception as e:
@@ -1247,7 +1247,7 @@ class Canvas(QtWidgets.QWidget):
                     mask = scaled_mask
                 
                 # cv2.imwrite(f"scaled_segmentation_mask_{detection_idx + 1}.png", mask)
-                logger.debug(f"Saved segmentation mask for detection {detection_idx + 1}.")
+                # logger.debug(f"Saved segmentation mask for detection {detection_idx + 1}.")
                 
                 # Step 7: Find contours
                 contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
@@ -1269,7 +1269,7 @@ class Canvas(QtWidgets.QWidget):
                     cropped_with_rects = cropped_image.copy()
                     cv2.drawContours(cropped_with_rects, [box_points], -1, (0, 255, 0), 2)
                     # cv2.imwrite(f"cropped_with_rects_{detection_idx + 1}.png", cropped_with_rects)
-                    logger.debug(f"Saved cropped image with rectangles for detection {detection_idx + 1}.")
+                    # logger.debug(f"Saved cropped image with rectangles for detection {detection_idx + 1}.")
 
                     if rotated:
                         # Scale points from mask space to the original image space with rotation