def get_perception_infos(adb_path, screenshot_file):
    """Capture a screenshot and describe the text and icons found on it.

    The function relies on module-level objects defined elsewhere in the
    file: ``ocr_detection``, ``ocr_recognition``, ``groundingdino_model``,
    ``temp_file``, ``caption_call_method``, ``tokenizer`` and ``model``.

    Pipeline:
      1. Call ``get_screenshot`` and read the size of ``screenshot_file``.
      2. Run OCR, merge the text blocks and draw the centre of every text
         box onto ``screenshot_file``.
      3. Detect icons, crop each one and caption it, either locally
         (``caption_call_method == "local"``) or through ``generate_api``.
      4. Replace every bounding box with its centre point.

    Args:
        adb_path: Forwarded unchanged to ``get_screenshot``.
        screenshot_file: Path of the screenshot image to analyse.
            Presumably this is the file ``get_screenshot`` writes; verify
            against that helper.

    Returns:
        A tuple ``(perception_infos, width, height)``. ``perception_infos``
        is a list of dicts with a ``"text"`` entry (``"text: ..."`` for OCR
        results, ``"icon: ..."`` for captioned icons, plain ``"icon"`` when
        no caption is available) and a ``"coordinates"`` entry holding the
        two-element centre of the element's box. ``width`` and ``height``
        are the pixel dimensions of the screenshot.
    """
    get_screenshot(adb_path)

    # Only the size is needed, so release the file handle immediately.
    with Image.open(screenshot_file) as screenshot:
        width, height = screenshot.size

    text, coordinates = ocr(screenshot_file, ocr_detection, ocr_recognition)
    text, coordinates = merge_text_blocks(text, coordinates)

    # Draw the geometric centre of every text box as a red dot and save the
    # result to screenshot_file.
    center_list = [
        [(box[0] + box[2]) / 2, (box[1] + box[3]) / 2] for box in coordinates
    ]
    draw_coordinates_on_image(screenshot_file, center_list)

    # Store the recognised text and its boxes in perception_infos.
    perception_infos = [
        {"text": "text: " + content, "coordinates": box}
        for content, box in zip(text, coordinates)
    ]

    # Use groundingdino_model to detect the icons on the screen and append
    # their boxes to perception_infos.
    icon_boxes = det(screenshot_file, "icon", groundingdino_model)
    perception_infos.extend(
        {"text": "icon", "coordinates": box} for box in icon_boxes
    )

    # Crop every icon. The entry's index in perception_infos is handed to
    # crop() as the icon id; the same number is parsed back out of the crop
    # file names below to find the entry again.
    for info_index, info in enumerate(perception_infos):
        if info["text"] == "icon":
            crop(screenshot_file, info["coordinates"], info_index)

    def icon_index(path):
        # "<dir>/<index>.<ext>" -> index
        return int(path.split("/")[-1].split(".")[0])

    images = get_all_files_in_folder(temp_file)
    if images:  # nothing to caption when no crop was produced
        images = sorted(images, key=icon_index)
        image_id = [icon_index(image) for image in images]

        # Describe each icon with the multimodal model and collect the
        # results in icon_map (keyed by 1-based position in `images`).
        prompt = "This image is an icon from a phone screen. Please briefly describe the shape and color of this icon in one sentence."
        if caption_call_method == "local":
            icon_map = {}
            for position, image in enumerate(images, start=1):
                image_path = os.path.join(temp_file, image)
                with Image.open(image_path) as icon:
                    icon_width, icon_height = icon.size
                too_large = (
                    icon_height > 0.8 * height
                    or icon_width * icon_height > 0.2 * width * height
                )
                if too_large:
                    # Crops taller than 80% of the screen or covering more
                    # than 20% of its area are not sent to the model.
                    # NOTE(review): the placeholder description "None" is an
                    # assumption - confirm what downstream consumers expect.
                    des = "None"
                else:
                    des = generate_local(tokenizer, model, image_path, prompt)
                icon_map[position] = des
        else:
            image_paths = [os.path.join(temp_file, image) for image in images]
            # NOTE(review): assumes generate_api returns a dict keyed from 1,
            # like the local branch - confirm against its definition.
            icon_map = generate_api(image_paths, prompt)

        # Merge the descriptions from icon_map into perception_infos.
        for info_index, position in zip(image_id, range(1, len(image_id) + 1)):
            description = icon_map.get(position)
            if description:  # keep the plain "icon" label if no caption came back
                perception_infos[info_index]["text"] = "icon: " + description

    # Collapse every bounding box to its centre point.
    # NOTE(review): halving and int() truncation are assumed here, mirroring
    # the centre computation used for center_list - confirm.
    for info in perception_infos:
        box = info["coordinates"]
        info["coordinates"] = [
            int((box[0] + box[2]) / 2),
            int((box[1] + box[3]) / 2),
        ]

    return perception_infos, width, height