From 2f13aebc6eed190d1681377d583b32eb7bbacec9 Mon Sep 17 00:00:00 2001
From: yadonglu
Date: Tue, 26 Nov 2024 13:17:15 -0800
Subject: [PATCH] update readme

---
 .gitignore                          |  1 +
 README.md                           |  2 +-
 demo.ipynb                          | 16 ++++++++--------
 weights/convert_safetensor_to_pt.py |  2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index 672c6ae..e455714 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ weights/icon_detect/
 weights/icon_detect_v1_5/
 .gradio
 __pycache__/
+debug.ipynb
diff --git a/README.md b/README.md
index 7d813ff..9e4ef3c 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 **OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.
 
 ## News
-- [2024/11/26] We release an updated version, OmniParser V1.5 which features more fine grained/small icon detection. Examples in the demo.ipynb.
+- [2024/11/26] We release an updated version, OmniParser V1.5, which features 1) more fine-grained/small icon detection and 2) prediction of whether each screen element is interactable or not. Examples are in demo.ipynb.
 - [2024/10] OmniParser was the #1 trending model on huggingface model hub (starting 10/29/2024).
 - [2024/10] Feel free to checkout our demo on [huggingface space](https://huggingface.co/spaces/microsoft/OmniParser)! (stay tuned for OmniParser + Claude Computer Use)
 - [2024/10] Both Interactive Region Detection Model and Icon functional description model are released! [Hugginface models](https://huggingface.co/microsoft/OmniParser)
diff --git a/demo.ipynb b/demo.ipynb
index 6f34af0..bd280c8 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -84,8 +84,8 @@
      "text": [
       "image size: (1919, 1079)\n",
       "\n",
-      "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 138 0s, 7.5ms\n",
-      "Speed: 5.4ms preprocess, 7.5ms inference, 5.1ms postprocess per image at shape (1, 3, 736, 1280)\n"
+      "image 1/1 /home/yadonglu/OmniParser/imgs/word.png: 736x1280 138 0s, 61.1ms\n",
+      "Speed: 9.0ms preprocess, 61.1ms inference, 432.6ms postprocess per image at shape (1, 3, 736, 1280)\n"
      ]
     }
    ],
@@ -129,16 +129,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        ""
       ]
      },
-     "execution_count": 16,
+     "execution_count": 4,
     "metadata": {},
      "output_type": "execute_result"
     },
diff --git a/weights/convert_safetensor_to_pt.py b/weights/convert_safetensor_to_pt.py
index 901aa09..d45869f 100644
--- a/weights/convert_safetensor_to_pt.py
+++ b/weights/convert_safetensor_to_pt.py
@@ -16,7 +16,7 @@ if args.version == 'v1':
     torch.save({'model':model}, 'weights/icon_detect/best.pt')
 elif args.version == 'v1_5':
     print("Converting v1_5")
-    tensor_dict = torch.load("weights/icon_detect_v1_5/model.safetensors")
+    tensor_dict = load_file("weights/icon_detect_v1_5/model.safetensors")
     model = DetectionModel('weights/icon_detect_v1_5/model.yaml')
     model.load_state_dict(tensor_dict)
     save_dict = {'model':model}
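
The convert_safetensor_to_pt.py fix replaces torch.load with safetensors' load_file: a .safetensors file is not a pickle archive, so torch.load cannot deserialize it, whereas safetensors.torch.load_file returns a plain state dict. Below is a minimal sketch of the v1_5 branch after this patch, assuming the safetensors and ultralytics packages are installed and the weights/icon_detect_v1_5/ files exist as in the repo; the final save path is an assumption, since the hunk ends before the v1_5 torch.save call.

    # Sketch of the patched v1_5 conversion path (not the full script).
    # Assumes: safetensors and ultralytics are installed, and the
    # weights/icon_detect_v1_5/ files exist as in the repo.
    import torch
    from safetensors.torch import load_file  # reads .safetensors; torch.load cannot
    from ultralytics.nn.tasks import DetectionModel

    # load_file returns a flat dict of tensor name -> torch.Tensor
    tensor_dict = load_file("weights/icon_detect_v1_5/model.safetensors")

    # Rebuild the detector from its YAML config, then load the weights
    model = DetectionModel('weights/icon_detect_v1_5/model.yaml')
    model.load_state_dict(tensor_dict)

    # Save in the {'model': ...} layout used by the v1 branch; this v1_5
    # output path is assumed here (the hunk ends before the save call).
    torch.save({'model': model}, 'weights/icon_detect_v1_5/best.pt')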
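For reference, the detection log in the demo.ipynb hunk ("image 1/1 ... 138 0s" plus the "Speed: ..." timings) is standard Ultralytics YOLO output. The following sketch shows a call that produces that kind of log, assuming the checkpoint converted above and the repo's imgs/word.png; demo.ipynb itself may load the model through its own helper functions.

    # Hedged sketch: reproduces the style of log seen in the demo.ipynb hunk.
    # Assumes the converted checkpoint from the sketch above and imgs/word.png.
    from ultralytics import YOLO

    model = YOLO("weights/icon_detect_v1_5/best.pt")

    # imgsz=1280 matches the 736x1280 inference shape in the logged output;
    # Ultralytics prints per-image detections plus preprocess/inference/
    # postprocess timings like the "Speed: ..." line above.
    results = model.predict("imgs/word.png", imgsz=1280)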