• Re: Predicting an object over a pretrained model is not working

    From Thomas Passin@21:1/5 to marc nicole via Python-list on Tue Jul 30 15:25:39 2024
    On 7/30/2024 2:18 PM, marc nicole via Python-list wrote:
    Hello all,

    I want to predict the label of an object given an image as input. I have trained a model using TensorFlow on an annotated database, where the target object to predict was added to the pretrained model. The code I am using is the following, where I set the target object image as input and want to get the prediction output:

    import cv2
    import numpy as np
    import tensorflow as tf

    # YoloTinyNet comes from the YOLO-tiny TensorFlow code the model was
    # trained with; this import path is an assumption.
    from yolo.net.yolo_tiny_net import YoloTinyNet


    class MultiObjectDetection():

        def __init__(self, classes_name):
            self._classes_name = classes_name
            self._num_classes = len(classes_name)

            self._common_params = {'image_size': 448,
                                   'num_classes': self._num_classes,
                                   'batch_size': 1}
            self._net_params = {'cell_size': 7, 'boxes_per_cell': 2,
                                'weight_decay': 0.0005}
            self._net = YoloTinyNet(self._common_params, self._net_params,
                                    test=True)

        def predict_object(self, image):
            predicts = self._net.inference(image)
            return predicts

        def process_predicts(self, resized_img, predicts, thresh=0.2):
            """
            Process the predictions of object detection for one input image.

            Args:
                resized_img: resized source image.
                predicts: output of the model.
                thresh: threshold on bounding-box confidence.
            Return:
                predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
            """
            cls_num = self._num_classes
            bbx_per_cell = self._net_params["boxes_per_cell"]
            cell_size = self._net_params["cell_size"]
            img_size = self._common_params["image_size"]
            p_classes = predicts[0, :, :, 0:cls_num]
            # Two bounding boxes per cell.
            C = predicts[0, :, :, cls_num:cls_num + bbx_per_cell]
            # All bounding-box positions.
            coordinate = predicts[0, :, :, cls_num + bbx_per_cell:]

            p_classes = np.reshape(p_classes,
                                   (cell_size, cell_size, 1, cls_num))
            C = np.reshape(C, (cell_size, cell_size, bbx_per_cell, 1))

            # Confidence for every class of every bounding box, shape
            # (cell_size, cell_size, boxes_per_cell, class_num)
            # = (7, 7, 2, num_classes).
            P = C * p_classes

            predicts_dict = {}
            for i in range(cell_size):
                for j in range(cell_size):
                    temp_data = np.zeros_like(P, np.float32)
                    temp_data[i, j, :, :] = P[i, j, :, :]
                    # Index of the maximum-confidence class over the
                    # bounding boxes of this cell.
                    position = np.argmax(temp_data)
                    index = np.unravel_index(position, P.shape)

                    if P[index] > thresh:
                        class_num = index[-1]
                        # (cell_size, cell_size, bbox_num_per_cell,
                        # [xcenter, ycenter, w, h])
                        coordinate = np.reshape(
                            coordinate,
                            (cell_size, cell_size, bbx_per_cell, 4))
                        max_coordinate = coordinate[index[0], index[1],
                                                    index[2], :]

                        xcenter = max_coordinate[0]
                        ycenter = max_coordinate[1]
                        w = max_coordinate[2]
                        h = max_coordinate[3]

                        xcenter = (index[1] + xcenter) * (1.0 * img_size / cell_size)
                        ycenter = (index[0] + ycenter) * (1.0 * img_size / cell_size)

                        w = w * img_size
                        h = h * img_size
                        xmin = 0 if (xcenter - w / 2.0 < 0) else (xcenter - w / 2.0)
                        # The original tested xcenter here; ycenter is the
                        # intended quantity.
                        ymin = 0 if (ycenter - h / 2.0 < 0) else (ycenter - h / 2.0)
                        # resized_img is square (448x448), so shape[0] and
                        # shape[1] are interchangeable here.
                        xmax = resized_img.shape[0] if (xmin + w) > resized_img.shape[0] else (xmin + w)
                        ymax = resized_img.shape[1] if (ymin + h) > resized_img.shape[1] else (ymin + h)

                        class_name = self._classes_name[class_num]
                        predicts_dict.setdefault(class_name, [])
                        predicts_dict[class_name].append(
                            [int(xmin), int(ymin), int(xmax), int(ymax),
                             P[index]])

            return predicts_dict

        def non_max_suppress(self, predicts_dict, threshold=0.5):
            """
            Apply non-maximum suppression to the predicted bounding boxes.

            Args:
                predicts_dict: {"stick": [[x1, y1, x2, y2, scores1], [...]]}.
                threshold: IoU threshold.
            Return:
                predicts_dict processed by non-maximum suppression.
            """
            for object_name, bbox in predicts_dict.items():
                # np.float is deprecated; use the builtin float.
                bbox_array = np.array(bbox, dtype=float)
                x1, y1, x2, y2, scores = (bbox_array[:, 0], bbox_array[:, 1],
                                          bbox_array[:, 2], bbox_array[:, 3],
                                          bbox_array[:, 4])
                areas = (x2 - x1 + 1) * (y2 - y1 + 1)
                order = scores.argsort()[::-1]
                keep = []
                while order.size > 0:
                    i = order[0]
                    keep.append(i)
                    xx1 = np.maximum(x1[i], x1[order[1:]])
                    yy1 = np.maximum(y1[i], y1[order[1:]])
                    xx2 = np.minimum(x2[i], x2[order[1:]])
                    yy2 = np.minimum(y2[i], y2[order[1:]])
                    inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
                    iou = inter / (areas[i] + areas[order[1:]] - inter)
                    indexs = np.where(iou <= threshold)[0]
                    order = order[indexs + 1]
                bbox = bbox_array[keep]
                predicts_dict[object_name] = bbox.tolist()
            return predicts_dict



    class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle",
                   "bus", "car", "cat", "chair", "cow", "diningtable",
                   "dog", "horse", "motorbike", "person",
                   "pottedplant", "sheep", "sofa", "train", "tvmonitor",
                   "small_ball"]
    modelFile = 'models\\train\\model.ckpt-0'
    track_object = "small_ball"
    print("object detection and tracking...")

    # The constructor takes only the class-name list.
    multiObjectDetect = MultiObjectDetection(class_names)
    image = tf.placeholder(tf.float32, (1, 448, 448, 3))
    object_predicts = multiObjectDetect.predict_object(image)

    sess = tf.Session()
    saver = tf.train.Saver(multiObjectDetect._net.trainable_collection)
    saver.restore(sess, modelFile)

    index = 0
    while True:
        src_img = cv2.imread("./weirdobject.jpg")
        resized_img = cv2.resize(src_img, (448, 448))

        np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
        np_img = np_img.astype(np.float32)
        np_img = np_img / 255.0 * 2 - 1  # scale pixel values to [-1, 1]
        np_img = np.reshape(np_img, (1, 448, 448, 3))

        np_predict = sess.run(object_predicts, feed_dict={image: np_img})
        predicts_dict = multiObjectDetect.process_predicts(resized_img, np_predict)
        predicts_dict = multiObjectDetect.non_max_suppress(predicts_dict)

        print("predict dict = ", predicts_dict)

    The problem with this code is that the predicts_dict returns:



    predict dict = {
     'sheep': [[233.0, 92.0, 448.0, -103.0, 5.3531270027160645], [167.0, 509.0, 209.0, 101.0, 4.947688579559326], [0.0, 0.0, 448.0, 431.0, 3.393721580505371]],
     'horse': [[374.0, 33.0, 282.0, 448.0, 5.277851581573486], [135.0, 688.0, -33.0, -14.0, 3.5144259929656982], [1.0, 117.0, 112.0, -138.0, 2.656987190246582]],
     'bicycle': [[461.0, 781.0, 154.0, -381.0, 5.918102741241455], [70.0, 344.0, 391.0, -138.0, 3.031444787979126], [378.0, 497.0, 46.0, 149.0, 2.7629122734069824], [541.0, 583.0, 69.0, 307.0, 2.7170517444610596], [323.0, 22.0, 336.0, 448.0, 1.608760952949524]],
     'bottle': [[390.0, 218.0, -199.0, 448.0, 4.582971096038818], [0.0, 0.0, 448.0, -410.0, 0.9097045063972473]],
     'sofa': [[346.0, 102.0, 323.0, -38.0, 2.371835947036743]],
     'dog': [[319.0, 254.0, -282.0, 373.0, 4.022889137268066]],
     'cat': [[63.0, -195.0, 365.0, -92.0, 3.5134828090667725]],
     'person': [[22.0, -122.0, 154.0, 448.0, 3.927537441253662], [350.0, 155.0, -36.0, -445.0, 2.679833173751831], [119.0, 416.0, -43.0, 292.0, 0.9529445171356201], [251.0, 445.0, 225.0, 188.0, 0.9001350402832031]],
     'train': [[329.0, 485.0, -24.0, -235.0, 2.7050414085388184], [483.0, 362.0, 237.0, -86.0, 2.555817127227783], [13.0, 365.0, 373.0, 448.0, 0.6229299902915955]],
     'small_ball': [[217.0, 737.0, 448.0, -315.0, 1.739920973777771], [117.0, 283.0, 153.0, 122.0, 1.5690066814422607]],
     'boat': [[164.0, 805.0, 34.0, -169.0, 4.972668170928955], [0.0, 0.0, 397.0, 69.0, 2.353729486465454], [302.0, 605.0, 15.0, -22.0, 2.0259625911712646]],
     'aeroplane': [[470.0, 616.0, -305.0, -37.0, 3.431873321533203], [0.0, 0.0, 448.0, -72.0, 2.836672306060791]],
     'bus': [[0.0, 0.0, -101.0, -280.0, 1.2078320980072021]],
     'pottedplant': [[620.0, -268.0, -124.0, 418.0, 2.158564805984497], [0.0, 0.0, 448.0, -779.0, 1.6623022556304932]],
     'tvmonitor': [[0.0, 0.0, 448.0, 85.0, 3.238999128341675], [240.0, 772.0, 200.0, 91.0, 1.7443398237228394], [546.0, 155.0, 448.0, 448.0, 1.1334525346755981], [107.0, 441.0, 432.0, 219.0, 0.5971617698669434]],
     'chair': [[470.0, -187.0, 106.0, 235.0, 3.8548083305358887], [524.0, 740.0, -103.0, 99.0, 3.636549234390259], [0.0, 0.0, 275.0, -325.0, 3.0997846126556396], [711.0, -231.0, -146.0, 392.0, 2.205275535583496]],
     'diningtable': [[138.0, -310.0, 111.0, 448.0, 4.660728931427002], [317.0, -66.0, 313.0, 6.0, 4.535496234893799], [0.0, 0.0, -41.0, 175.0, 1.8571208715438843], [21.0, -92.0, 76.0, 172.0, 1.2035608291625977], [0.0, 0.0, 448.0, -250.0, 1.00322687625885]],
     'car': [[312.0, 232.0, 132.0, 309.0, 3.205225706100464], [514.0, -76.0, 218.0, 448.0, 1.4289973974227905], [0.0, 0.0, 448.0, 142.0, 0.7124998569488525]]}


    While I expect the dict to contain only the small_ball key.



    How is that possible? Where is the prediction output? How do I fix the code?

    Without trying to figure out all that code, why would you expect only
    results for a single key? An ML system is going to compute
    probabilities and parameters for all objects it knows about (presumably
    subject to some threshold).
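
    Selecting the one class you care about is a post-processing step you do
    yourself. As a minimal sketch, reusing the predicts_dict and
    track_object names from your script:

    boxes = predicts_dict.get(track_object, [])
    if boxes:
        # each entry is [xmin, ymin, xmax, ymax, score]; keep the best one
        best = max(boxes, key=lambda b: b[-1])
        print(track_object, "box:", best[:4], "score:", best[-1])
    else:
        print(track_object, "was not detected above the threshold")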

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Thomas Passin@21:1/5 to marc nicole on Tue Jul 30 17:45:20 2024
    On 7/30/2024 4:49 PM, marc nicole wrote:
    OK, but how is the probability of small_ball greater than the others?
    I can't find it anywhere; what is its value?

    It's your code. I wouldn't know. I suppose it's represented somewhere in
    all those parameters. You need to understand what those function calls
    are returning. It's documented somewhere, right?

    And you really do need to know the probabilities of the competing images because otherwise you won't know how confident you can be that the identification is a strong one.
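
    As a rough sketch of what I mean (assuming, from your output, that the
    last element of each box list is the score), you could rank the classes
    by their best-scoring box and see where small_ball actually lands:

    ranking = sorted(
        ((name, max(box[-1] for box in boxes))
         for name, boxes in predicts_dict.items() if boxes),
        key=lambda pair: pair[1], reverse=True)
    for name, score in ranking:
        print(name, score)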

    On Tue, Jul 30, 2024 at 21:37, Thomas Passin via Python-list
    <python-list@python.org> wrote:

    [full quote of the earlier message snipped]

    > Without trying to figure out all that code, why would you expect only
    > results for a single key? An ML system is going to compute
    > probabilities and parameters for all objects it knows about (presumably
    > subject to some threshold).


    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From dn@21:1/5 to marc nicole via Python-list on Wed Jul 31 10:16:29 2024
    On 31/07/24 06:18, marc nicole via Python-list wrote:
    Hello all,

    I want to predict the label of an object given an image as input. I have trained a model using TensorFlow on an annotated database, where the target object to predict was added to the pretrained model. The code I am using is the following, where I set the target object image as input and want to get the prediction output:

    ...


    While I expect the dict to contain only the small_ball key.

    How is that possible? Where is the prediction output? How do I fix the code?


    To save us lots of reading and study to be able to help you, please advise:

    1 what are the meanings of all these numbers?

    'sheep': [[233.0, 92.0, 448.0, -103.0, 5.3531270027160645],
              [167.0, 509.0, 209.0, 101.0, 4.947688579559326],
              [0.0, 0.0, 448.0, 431.0, 3.393721580505371]]

    2 why the dict has not been sorted into a meaningful order (assuming it
    hasn't been; see the sketch below)

    3 how can one tell that the image is more likely to be a sheep than a train?
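
    By way of illustration, a minimal sketch of the kind of ordering and
    comparison I mean, assuming the last element of each entry in your
    predicts_dict is a confidence score:

    best = {name: max(box[-1] for box in boxes)
            for name, boxes in predicts_dict.items() if boxes}
    for name in sorted(best, key=best.get, reverse=True):
        print(name, best[name])
    print(best["sheep"] > best["train"])  # is "sheep" really more likely?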

    --
    Regards,
    =dn

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)