[ { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many pedestal fans are in the scene?", "response_gt": [ "1", "1 pedestal fans", "I see 1", "I see 1 pedestal fans", "I can count 1 pedestal fans" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "4", "4 windows", "I see 4", "I see 4 windows", "I can count 4 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "3", "3 boxs", "I see 3", "I see 3 boxs", "I can count 3 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "2", "2 electrical ducts", "I see 2", "I see 2 electrical ducts", "I can count 2 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "4", "4 ceiling lamps", "I see 4", "I see 4 ceiling lamps", "I can count 4 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "1", "1 whiteboard erasers", "I see 1", "I see 1 whiteboard erasers", "I can count 1 whiteboard erasers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "2", "2 whiteboards", "I see 2", "I see 2 whiteboards", "I can count 2 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "5", "5 monitors", "I see 5", "I see 5 monitors", "I can count 5 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "2", "2 blindss", "I see 2", "I see 2 blindss", "I can count 2 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many containers are in the scene?", "response_gt": [ "1", "1 containers", "I see 1", "I see 1 containers", "I can count 1 containers" ], "response_pred": "6 containers" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "5", "5 objects", "I see 5", "I see 5 objects", "I can count 5 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "2", "2 heaters", "I see 2", "I see 2 heaters", "I can count 2 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "2", "2 storage cabinets", "I see 2", "I see 2 storage cabinets", "I can count 2 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "1", "1 power strips", "I see 1", "I see 1 power strips", "I can count 1 power strips" ], "response_pred": "3 power strips" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many cables are in the scene?", "response_gt": [ "2", "2 cables", "I see 2", "I see 2 cables", "I can count 2 cables" ], "response_pred": "2 cables" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "4", "4 office chairs", "I see 4", "I see 4 office chairs", "I can count 4 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "3", "3 cabinets", "I see 3", "I see 3 cabinets", "I can count 3 cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "3", "3 tables", "I see 3", "I see 3 tables", "I can count 3 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many bukets are in the scene?", "response_gt": [ "1", "1 bukets", "I see 1", "I see 1 bukets", "I can count 1 bukets" ], "response_pred": "2 bukets" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many pen holders are in the scene?", "response_gt": [ "2", "2 pen holders", "I see 2", "I see 2 pen holders", "I can count 2 pen holders" ], "response_pred": "3 pen holders" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many coat racks are in the scene?", "response_gt": [ "1", "1 coat racks", "I see 1", "I see 1 coat racks", "I can count 1 coat racks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "2", "2 pipes", "I see 2", "I see 2 pipes", "I can count 2 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "1", "1 bookshelfs", "I see 1", "I see 1 bookshelfs", "I can count 1 bookshelfs" ], "response_pred": "3 bookshelves" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many balls are in the scene?", "response_gt": [ "1", "1 balls", "I see 1", "I see 1 balls", "I can count 1 balls" ], "response_pred": "2 balls" }, { "source": "scannet", "scene_id": "7b6477cb95", "instruction": " USER: How many hole punchers are in the scene?", "response_gt": [ "1", "1 hole punchers", "I see 1", "I see 1 hole punchers", "I can count 1 hole punchers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "3", "3 boxs", "I see 3", "I see 3 boxs", "I can count 3 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "2", "2 trash cans", "I see 2", "I see 2 trash cans", "I can count 2 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "4", "4 pipes", "I see 4", "I see 4 pipes", "I can count 4 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "2", "2 heaters", "I see 2", "I see 2 heaters", "I can count 2 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "3", "3 computer towers", "I see 3", "I see 3 computer towers", "I can count 3 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "3", "3 windows", "I see 3", "I see 3 windows", "I can count 3 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many blind rails are in the scene?", "response_gt": [ "1", "1 blind rails", "I see 1", "I see 1 blind rails", "I can count 1 blind rails" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "4", "4 objects", "I see 4", "I see 4 objects", "I can count 4 objects" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many desk fans are in the scene?", "response_gt": [ "1", "1 desk fans", "I see 1", "I see 1 desk fans", "I can count 1 desk fans" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "6", "6 monitors", "I see 6", "I see 6 monitors", "I can count 6 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "2", "2 mugs", "I see 2", "I see 2 mugs", "I can count 2 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many headphoness are in the scene?", "response_gt": [ "1", "1 headphoness", "I see 1", "I see 1 headphoness", "I can count 1 headphoness" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many food containers are in the scene?", "response_gt": [ "1", "1 food containers", "I see 1", "I see 1 food containers", "I can count 1 food containers" ], "response_pred": "4 food containers" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many vacuum flasks are in the scene?", "response_gt": [ "1", "1 vacuum flasks", "I see 1", "I see 1 vacuum flasks", "I can count 1 vacuum flasks" ], "response_pred": "2 vacuum flasks" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many plastic bottles are in the scene?", "response_gt": [ "5", "5 plastic bottles", "I see 5", "I see 5 plastic bottles", "I can count 5 plastic bottles" ], "response_pred": "2 plastic bottles" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many paperss are in the scene?", "response_gt": [ "2", "2 paperss", "I see 2", "I see 2 paperss", "I can count 2 paperss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many disinfectant dispensers are in the scene?", "response_gt": [ "1", "1 disinfectant dispensers", "I see 1", "I see 1 disinfectant dispensers", "I can count 1 disinfectant dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many coasters are in the scene?", "response_gt": [ "1", "1 coasters", "I see 1", "I see 1 coasters", "I can count 1 coasters" ], "response_pred": "2 coasters" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "3", "3 electrical ducts", "I see 3", "I see 3 electrical ducts", "I can count 3 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many vertical blind controls are in the scene?", "response_gt": [ "1", "1 vertical blind controls", "I see 1", "I see 1 vertical blind controls", "I can count 1 vertical blind controls" ], "response_pred": "2 vertical blind controls" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "4", "4 ceiling lamps", "I see 4", "I see 4 ceiling lamps", "I can count 4 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "4", "4 office chairs", "I see 4", "I see 4 office chairs", "I can count 4 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "1", "1 papers", "I see 1", "I see 1 papers", "I can count 1 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "2", "2 mouses", "I see 2", "I see 2 mouses", "I can count 2 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "2", "2 storage cabinets", "I see 2", "I see 2 storage cabinets", "I can count 2 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many mousepads are in the scene?", "response_gt": [ "1", "1 mousepads", "I see 1", "I see 1 mousepads", "I can count 1 mousepads" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many office tables are in the scene?", "response_gt": [ "4", "4 office tables", "I see 4", "I see 4 office tables", "I can count 4 office tables" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many coat hangers are in the scene?", "response_gt": [ "1", "1 coat hangers", "I see 1", "I see 1 coat hangers", "I can count 1 coat hangers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many glass panes are in the scene?", "response_gt": [ "1", "1 glass panes", "I see 1", "I see 1 glass panes", "I can count 1 glass panes" ], "response_pred": "2 glass panes" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many cardboard boxs are in the scene?", "response_gt": [ "1", "1 cardboard boxs", "I see 1", "I see 1 cardboard boxs", "I can count 1 cardboard boxs" ], "response_pred": "2 cardboard boxes" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many wall outlets are in the scene?", "response_gt": [ "2", "2 wall outlets", "I see 2", "I see 2 wall outlets", "I can count 2 wall outlets" ], "response_pred": "2 wall outlets" }, { "source": "scannet", "scene_id": "c50d2d1d42", "instruction": " USER: How many carboard boxs are in the scene?", "response_gt": [ "1", "1 carboard boxs", "I see 1", "I see 1 carboard boxs", "I can count 1 carboard boxs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "38", "38 objects", "I see 38", "I see 38 objects", "I can count 38 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "8", "8 bookss", "I see 8", "I see 8 bookss", "I can count 8 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many powerstrips are in the scene?", "response_gt": [ "1", "1 powerstrips", "I see 1", "I see 1 powerstrips", "I can count 1 powerstrips" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many power bricks are in the scene?", "response_gt": [ "1", "1 power bricks", "I see 1", "I see 1 power bricks", "I can count 1 power bricks" ], "response_pred": "2 power bricks" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many books are in the scene?", "response_gt": [ "6", "6 books", "I see 6", "I see 6 books", "I can count 6 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "3", "3 monitors", "I see 3", "I see 3 monitors", "I can count 3 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "2", "2 splits", "I see 2", "I see 2 splits", "I can count 2 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "2", "2 blindss", "I see 2", "I see 2 blindss", "I can count 2 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "2", "2 chairs", "I see 2", "I see 2 chairs", "I can count 2 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "2", "2 bookshelfs", "I see 2", "I see 2 bookshelfs", "I can count 2 bookshelfs" ], "response_pred": "2 bookshelves" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "2", "2 computer towers", "I see 2", "I see 2 computer towers", "I can count 2 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many desks are in the scene?", "response_gt": [ "1", "1 desks", "I see 1", "I see 1 desks", "I can count 1 desks" ], "response_pred": "3 desks" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many suitcases are in the scene?", "response_gt": [ "1", "1 suitcases", "I see 1", "I see 1 suitcases", "I can count 1 suitcases" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many rugs are in the scene?", "response_gt": [ "1", "1 rugs", "I see 1", "I see 1 rugs", "I can count 1 rugs" ], "response_pred": "2 rugs" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "4", "4 boxs", "I see 4", "I see 4 boxs", "I can count 4 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many vacuum cleaners are in the scene?", "response_gt": [ "1", "1 vacuum cleaners", "I see 1", "I see 1 vacuum cleaners", "I can count 1 vacuum cleaners" ], "response_pred": "2 vacuum cleaners" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many portable speakers are in the scene?", "response_gt": [ "1", "1 portable speakers", "I see 1", "I see 1 portable speakers", "I can count 1 portable speakers" ], "response_pred": "2 portable speakers" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many multiplugs are in the scene?", "response_gt": [ "2", "2 multiplugs", "I see 2", "I see 2 multiplugs", "I can count 2 multiplugs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many air vents are in the scene?", "response_gt": [ "1", "1 air vents", "I see 1", "I see 1 air vents", "I can count 1 air vents" ], "response_pred": "2 air vents" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "cc5237fd77", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "1", "1 electrical ducts", "I see 1", "I see 1 electrical ducts", "I can count 1 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "2", "2 cabinets", "I see 2", "I see 2 cabinets", "I can count 2 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "2", "2 computer towers", "I see 2", "I see 2 computer towers", "I can count 2 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "4", "4 chairs", "I see 4", "I see 4 chairs", "I can count 4 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "4", "4 ceiling lamps", "I see 4", "I see 4 ceiling lamps", "I can count 4 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many wall hookss are in the scene?", "response_gt": [ "2", "2 wall hookss", "I see 2", "I see 2 wall hookss", "I can count 2 wall hookss" ], "response_pred": "2 wall hooks" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "1", "1 storage cabinets", "I see 1", "I see 1 storage cabinets", "I can count 1 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "2", "2 trash cans", "I see 2", "I see 2 trash cans", "I can count 2 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "1", "1 bookshelfs", "I see 1", "I see 1 bookshelfs", "I can count 1 bookshelfs" ], "response_pred": "4 bookshelf" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "12", "12 boxs", "I see 12", "I see 12 boxs", "I can count 12 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "10", "10 objects", "I see 10", "I see 10 objects", "I can count 10 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "2", "2 power strips", "I see 2", "I see 2 power strips", "I can count 2 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many cabless are in the scene?", "response_gt": [ "1", "1 cabless", "I see 1", "I see 1 cabless", "I can count 1 cabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many tripods are in the scene?", "response_gt": [ "1", "1 tripods", "I see 1", "I see 1 tripods", "I can count 1 tripods" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "1", "1 whiteboard erasers", "I see 1", "I see 1 whiteboard erasers", "I can count 1 whiteboard erasers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "3", "3 tables", "I see 3", "I see 3 tables", "I can count 3 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "6", "6 papers", "I see 6", "I see 6 papers", "I can count 6 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "2", "2 mouses", "I see 2", "I see 2 mouses", "I can count 2 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "3", "3 monitors", "I see 3", "I see 3 monitors", "I can count 3 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "2", "2 mugs", "I see 2", "I see 2 mugs", "I can count 2 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "2", "2 telephones", "I see 2", "I see 2 telephones", "I can count 2 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many headphones are in the scene?", "response_gt": [ "1", "1 headphones", "I see 1", "I see 1 headphones", "I can count 1 headphones" ], "response_pred": "2 headphones" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many fruits are in the scene?", "response_gt": [ "2", "2 fruits", "I see 2", "I see 2 fruits", "I can count 2 fruits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many filter jugs are in the scene?", "response_gt": [ "1", "1 filter jugs", "I see 1", "I see 1 filter jugs", "I can count 1 filter jugs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many plugs are in the scene?", "response_gt": [ "4", "4 plugs", "I see 4", "I see 4 plugs", "I can count 4 plugs" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many outlets are in the scene?", "response_gt": [ "5", "5 outlets", "I see 5", "I see 5 outlets", "I can count 5 outlets" ], "response_pred": "2 outlets" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many tubes are in the scene?", "response_gt": [ "1", "1 tubes", "I see 1", "I see 1 tubes", "I can count 1 tubes" ], "response_pred": "4 tubes" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "2", "2 heaters", "I see 2", "I see 2 heaters", "I can count 2 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "1", "1 pipes", "I see 1", "I see 1 pipes", "I can count 1 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "1", "1 backpacks", "I see 1", "I see 1 backpacks", "I can count 1 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "acd95847c5", "instruction": " USER: How many clothess are in the scene?", "response_gt": [ "1", "1 clothess", "I see 1", "I see 1 clothess", "I can count 1 clothess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "12", "12 ceiling lamps", "I see 12", "I see 12 ceiling lamps", "I can count 12 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "4", "4 boxs", "I see 4", "I see 4 boxs", "I can count 4 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "18", "18 chairs", "I see 18", "I see 18 chairs", "I can count 18 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "2", "2 speakers", "I see 2", "I see 2 speakers", "I can count 2 speakers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many outlets are in the scene?", "response_gt": [ "2", "2 outlets", "I see 2", "I see 2 outlets", "I can count 2 outlets" ], "response_pred": "2 outlets" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "2", "2 tables", "I see 2", "I see 2 tables", "I can count 2 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many air conditioners are in the scene?", "response_gt": [ "3", "3 air conditioners", "I see 3", "I see 3 air conditioners", "I can count 3 air conditioners" ], "response_pred": "2 air conditioners" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "1", "1 bookshelfs", "I see 1", "I see 1 bookshelfs", "I can count 1 bookshelfs" ], "response_pred": "3 bookshelf" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "3", "3 whiteboards", "I see 3", "I see 3 whiteboards", "I can count 3 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "4", "4 bookss", "I see 4", "I see 4 bookss", "I can count 4 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "3", "3 smoke detectors", "I see 3", "I see 3 smoke detectors", "I can count 3 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many blackboards are in the scene?", "response_gt": [ "1", "1 blackboards", "I see 1", "I see 1 blackboards", "I can count 1 blackboards" ], "response_pred": "2 blackboards" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "3", "3 pipes", "I see 3", "I see 3 pipes", "I can count 3 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "1", "1 power strips", "I see 1", "I see 1 power strips", "I can count 1 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many toys are in the scene?", "response_gt": [ "2", "2 toys", "I see 2", "I see 2 toys", "I can count 2 toys" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many magazine racks are in the scene?", "response_gt": [ "1", "1 magazine racks", "I see 1", "I see 1 magazine racks", "I can count 1 magazine racks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many bottle sprays are in the scene?", "response_gt": [ "1", "1 bottle sprays", "I see 1", "I see 1 bottle sprays", "I can count 1 bottle sprays" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "fb5a96b1a2", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "2", "2 curtains", "I see 2", "I see 2 curtains", "I can count 2 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "5", "5 objects", "I see 5", "I see 5 objects", "I can count 5 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "1", "1 monitors", "I see 1", "I see 1 monitors", "I can count 1 monitors" ], "response_pred": "4 monitors" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "4", "4 structures", "I see 4", "I see 4 structures", "I can count 4 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "3", "3 splits", "I see 3", "I see 3 splits", "I can count 3 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many fitted wardrobes are in the scene?", "response_gt": [ "1", "1 fitted wardrobes", "I see 1", "I see 1 fitted wardrobes", "I can count 1 fitted wardrobes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "2", "2 tables", "I see 2", "I see 2 tables", "I can count 2 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many folding sofas are in the scene?", "response_gt": [ "1", "1 folding sofas", "I see 1", "I see 1 folding sofas", "I can count 1 folding sofas" ], "response_pred": "2 folding sofas" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many headphones are in the scene?", "response_gt": [ "1", "1 headphones", "I see 1", "I see 1 headphones", "I can count 1 headphones" ], "response_pred": "2 headphones" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many fans are in the scene?", "response_gt": [ "1", "1 fans", "I see 1", "I see 1 fans", "I can count 1 fans" ], "response_pred": "2 fans" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "2", "2 speakers", "I see 2", "I see 2 speakers", "I can count 2 speakers" ], "response_pred": "2 speakers" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many monitor stands are in the scene?", "response_gt": [ "1", "1 monitor stands", "I see 1", "I see 1 monitor stands", "I can count 1 monitor stands" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many objectss are in the scene?", "response_gt": [ "1", "1 objectss", "I see 1", "I see 1 objectss", "I can count 1 objectss" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a24f64f7fb", "instruction": " USER: How many sit-up pillows are in the scene?", "response_gt": [ "1", "1 sit-up pillows", "I see 1", "I see 1 sit-up pillows", "I can count 1 sit-up pillows" ], "response_pred": "2 sit-up pillows" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many power socketss are in the scene?", "response_gt": [ "1", "1 power socketss", "I see 1", "I see 1 power socketss", "I can count 1 power socketss" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many buckets are in the scene?", "response_gt": [ "2", "2 buckets", "I see 2", "I see 2 buckets", "I can count 2 buckets" ], "response_pred": "3 buckets" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "2", "2 whiteboards", "I see 2", "I see 2 whiteboards", "I can count 2 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many ceiling beams are in the scene?", "response_gt": [ "1", "1 ceiling beams", "I see 1", "I see 1 ceiling beams", "I can count 1 ceiling beams" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many desks are in the scene?", "response_gt": [ "3", "3 desks", "I see 3", "I see 3 desks", "I can count 3 desks" ], "response_pred": "4 desks" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "3", "3 electrical ducts", "I see 3", "I see 3 electrical ducts", "I can count 3 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "3", "3 office chairs", "I see 3", "I see 3 office chairs", "I can count 3 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "4", "4 ceiling lights", "I see 4", "I see 4 ceiling lights", "I can count 4 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many window sills are in the scene?", "response_gt": [ "1", "1 window sills", "I see 1", "I see 1 window sills", "I can count 1 window sills" ], "response_pred": "2 window sills" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "2", "2 storage cabinets", "I see 2", "I see 2 storage cabinets", "I can count 2 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "3", "3 keyboards", "I see 3", "I see 3 keyboards", "I can count 3 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many standing fans are in the scene?", "response_gt": [ "1", "1 standing fans", "I see 1", "I see 1 standing fans", "I can count 1 standing fans" ], "response_pred": "2 fans" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "4", "4 monitors", "I see 4", "I see 4 monitors", "I can count 4 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "1", "1 computer towers", "I see 1", "I see 1 computer towers", "I can count 1 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "2", "2 telephones", "I see 2", "I see 2 telephones", "I can count 2 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "2", "2 objects", "I see 2", "I see 2 objects", "I can count 2 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "2", "2 bottles", "I see 2", "I see 2 bottles", "I can count 2 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "4", "4 mouses", "I see 4", "I see 4 mouses", "I can count 4 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many microphones are in the scene?", "response_gt": [ "1", "1 microphones", "I see 1", "I see 1 microphones", "I can count 1 microphones" ], "response_pred": "2 microphones" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many clothes hangers are in the scene?", "response_gt": [ "1", "1 clothes hangers", "I see 1", "I see 1 clothes hangers", "I can count 1 clothes hangers" ], "response_pred": "2 clothes hangers" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "1ada7a0617", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many rolling chairs are in the scene?", "response_gt": [ "2", "2 rolling chairs", "I see 2", "I see 2 rolling chairs", "I can count 2 rolling chairs" ], "response_pred": "2 rolling chairs" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "21", "21 objects", "I see 21", "I see 21 objects", "I can count 21 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many power sockers are in the scene?", "response_gt": [ "1", "1 power sockers", "I see 1", "I see 1 power sockers", "I can count 1 power sockers" ], "response_pred": "2 power socks" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "3", "3 whiteboards", "I see 3", "I see 3 whiteboards", "I can count 3 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "2", "2 tables", "I see 2", "I see 2 tables", "I can count 2 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many footstools are in the scene?", "response_gt": [ "1", "1 footstools", "I see 1", "I see 1 footstools", "I can count 1 footstools" ], "response_pred": "3 footstools" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "1", "1 bookshelfs", "I see 1", "I see 1 bookshelfs", "I can count 1 bookshelfs" ], "response_pred": "3 bookshelf" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "3", "3 pictures", "I see 3", "I see 3 pictures", "I can count 3 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "6", "6 monitors", "I see 6", "I see 6 monitors", "I can count 6 monitors" ], "response_pred": "4 monitors" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "2", "2 mouses", "I see 2", "I see 2 mouses", "I can count 2 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "1", "1 computer towers", "I see 1", "I see 1 computer towers", "I can count 1 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "3", "3 trash cans", "I see 3", "I see 3 trash cans", "I can count 3 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many post its are in the scene?", "response_gt": [ "1", "1 post its", "I see 1", "I see 1 post its", "I can count 1 post its" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "6", "6 ceiling lights", "I see 6", "I see 6 ceiling lights", "I can count 6 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "2", "2 laptops", "I see 2", "I see 2 laptops", "I can count 2 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many headsets are in the scene?", "response_gt": [ "1", "1 headsets", "I see 1", "I see 1 headsets", "I can count 1 headsets" ], "response_pred": "2 headsets" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many panels are in the scene?", "response_gt": [ "1", "1 panels", "I see 1", "I see 1 panels", "I can count 1 panels" ], "response_pred": "4 panels" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many monitor holders are in the scene?", "response_gt": [ "1", "1 monitor holders", "I see 1", "I see 1 monitor holders", "I can count 1 monitor holders" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "1", "1 papers", "I see 1", "I see 1 papers", "I can count 1 papers" ], "response_pred": "3 papers" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "1", "1 structures", "I see 1", "I see 1 structures", "I can count 1 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many air conditioners are in the scene?", "response_gt": [ "1", "1 air conditioners", "I see 1", "I see 1 air conditioners", "I can count 1 air conditioners" ], "response_pred": "2 air conditioners" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5eb31827b7", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "3", "3 cabinets", "I see 3", "I see 3 cabinets", "I can count 3 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many coat hangers are in the scene?", "response_gt": [ "1", "1 coat hangers", "I see 1", "I see 1 coat hangers", "I can count 1 coat hangers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "1", "1 plates", "I see 1", "I see 1 plates", "I can count 1 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many paper bags are in the scene?", "response_gt": [ "1", "1 paper bags", "I see 1", "I see 1 paper bags", "I can count 1 paper bags" ], "response_pred": "2 paper bags" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "3", "3 windows", "I see 3", "I see 3 windows", "I can count 3 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "1", "1 printers", "I see 1", "I see 1 printers", "I can count 1 printers" ], "response_pred": "3 printers" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "2", "2 office chairs", "I see 2", "I see 2 office chairs", "I can count 2 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "3", "3 chairs", "I see 3", "I see 3 chairs", "I can count 3 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many objectss are in the scene?", "response_gt": [ "2", "2 objectss", "I see 2", "I see 2 objectss", "I can count 2 objectss" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many coffee machines are in the scene?", "response_gt": [ "1", "1 coffee machines", "I see 1", "I see 1 coffee machines", "I can count 1 coffee machines" ], "response_pred": "2 coffee machines" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "2", "2 whiteboards", "I see 2", "I see 2 whiteboards", "I can count 2 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many phones are in the scene?", "response_gt": [ "2", "2 phones", "I see 2", "I see 2 phones", "I can count 2 phones" ], "response_pred": "3 phones" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many decorations are in the scene?", "response_gt": [ "1", "1 decorations", "I see 1", "I see 1 decorations", "I can count 1 decorations" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "2", "2 whiteboard erasers", "I see 2", "I see 2 whiteboard erasers", "I can count 2 whiteboard erasers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "1", "1 papers", "I see 1", "I see 1 papers", "I can count 1 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "2", "2 trash cans", "I see 2", "I see 2 trash cans", "I can count 2 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "2", "2 storage cabinets", "I see 2", "I see 2 storage cabinets", "I can count 2 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "15", "15 objects", "I see 15", "I see 15 objects", "I can count 15 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many pedestal fans are in the scene?", "response_gt": [ "1", "1 pedestal fans", "I see 1", "I see 1 pedestal fans", "I can count 1 pedestal fans" ], "response_pred": "2 fans" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many sockets are in the scene?", "response_gt": [ "1", "1 sockets", "I see 1", "I see 1 sockets", "I can count 1 sockets" ], "response_pred": "2 sockets" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "4", "4 monitors", "I see 4", "I see 4 monitors", "I can count 4 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many frames are in the scene?", "response_gt": [ "1", "1 frames", "I see 1", "I see 1 frames", "I can count 1 frames" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "2", "2 electrical ducts", "I see 2", "I see 2 electrical ducts", "I can count 2 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "3", "3 tables", "I see 3", "I see 3 tables", "I can count 3 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many headphone cases are in the scene?", "response_gt": [ "1", "1 headphone cases", "I see 1", "I see 1 headphone cases", "I can count 1 headphone cases" ], "response_pred": "2 headphone cases" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many plushies are in the scene?", "response_gt": [ "1", "1 plushies", "I see 1", "I see 1 plushies", "I can count 1 plushies" ], "response_pred": "2 plushies" }, { "source": "scannet", "scene_id": "3e8bba0176", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "4", "4 ceiling lamps", "I see 4", "I see 4 ceiling lamps", "I can count 4 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "4", "4 chairs", "I see 4", "I see 4 chairs", "I can count 4 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many paintings are in the scene?", "response_gt": [ "1", "1 paintings", "I see 1", "I see 1 paintings", "I can count 1 paintings" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "2", "2 bookshelfs", "I see 2", "I see 2 bookshelfs", "I can count 2 bookshelfs" ], "response_pred": "4 bookshelf" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "5", "5 pictures", "I see 5", "I see 5 pictures", "I can count 5 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many wall clocks are in the scene?", "response_gt": [ "1", "1 wall clocks", "I see 1", "I see 1 wall clocks", "I can count 1 wall clocks" ], "response_pred": "2 wall clocks" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many desks are in the scene?", "response_gt": [ "1", "1 desks", "I see 1", "I see 1 desks", "I can count 1 desks" ], "response_pred": "3 desks" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "2", "2 ceiling lamps", "I see 2", "I see 2 ceiling lamps", "I can count 2 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "1", "1 exhaust fans", "I see 1", "I see 1 exhaust fans", "I can count 1 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many books are in the scene?", "response_gt": [ "202", "202 books", "I see 202", "I see 202 books", "I can count 202 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many filess are in the scene?", "response_gt": [ "6", "6 filess", "I see 6", "I see 6 filess", "I can count 6 filess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "9", "9 boxs", "I see 9", "I see 9 boxs", "I can count 9 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many paper towels are in the scene?", "response_gt": [ "1", "1 paper towels", "I see 1", "I see 1 paper towels", "I can count 1 paper towels" ], "response_pred": "4 paper towels" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "2", "2 trash bins", "I see 2", "I see 2 trash bins", "I can count 2 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many safe boxs are in the scene?", "response_gt": [ "1", "1 safe boxs", "I see 1", "I see 1 safe boxs", "I can count 1 safe boxs" ], "response_pred": "3 safe boxes" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many files are in the scene?", "response_gt": [ "11", "11 files", "I see 11", "I see 11 files", "I can count 11 files" ], "response_pred": "3 files" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "3", "3 cups", "I see 3", "I see 3 cups", "I can count 3 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many folderss are in the scene?", "response_gt": [ "1", "1 folderss", "I see 1", "I see 1 folderss", "I can count 1 folderss" ], "response_pred": "2 folders" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many folders are in the scene?", "response_gt": [ "1", "1 folders", "I see 1", "I see 1 folders", "I can count 1 folders" ], "response_pred": "2 folders" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many pencil holders are in the scene?", "response_gt": [ "1", "1 pencil holders", "I see 1", "I see 1 pencil holders", "I can count 1 pencil holders" ], "response_pred": "2 pencil holders" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many mouse pads are in the scene?", "response_gt": [ "1", "1 mouse pads", "I see 1", "I see 1 mouse pads", "I can count 1 mouse pads" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "3f15a9266d", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "2", "2 monitors", "I see 2", "I see 2 monitors", "I can count 2 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "3", "3 heaters", "I see 3", "I see 3 heaters", "I can count 3 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "7", "7 splits", "I see 7", "I see 7 splits", "I can count 7 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many window sills are in the scene?", "response_gt": [ "2", "2 window sills", "I see 2", "I see 2 window sills", "I can count 2 window sills" ], "response_pred": "2 window sills" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "2", "2 structures", "I see 2", "I see 2 structures", "I can count 2 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "2", "2 blindss", "I see 2", "I see 2 blindss", "I can count 2 blindss" ], "response_pred": "2 blindss" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many removes are in the scene?", "response_gt": [ "2", "2 removes", "I see 2", "I see 2 removes", "I can count 2 removes" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "3", "3 whiteboards", "I see 3", "I see 3 whiteboards", "I can count 3 whiteboards" ], "response_pred": "2 whiteboards" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "40", "40 chairs", "I see 40", "I see 40 chairs", "I can count 40 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "1", "1 whiteboard erasers", "I see 1", "I see 1 whiteboard erasers", "I can count 1 whiteboard erasers" ], "response_pred": "2 whiteboard erasers" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "7", "7 tables", "I see 7", "I see 7 tables", "I can count 7 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many curtain rails are in the scene?", "response_gt": [ "1", "1 curtain rails", "I see 1", "I see 1 curtain rails", "I can count 1 curtain rails" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many lans are in the scene?", "response_gt": [ "2", "2 lans", "I see 2", "I see 2 lans", "I can count 2 lans" ], "response_pred": "2 lans" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "4", "4 electrical ducts", "I see 4", "I see 4 electrical ducts", "I can count 4 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many plugs are in the scene?", "response_gt": [ "2", "2 plugs", "I see 2", "I see 2 plugs", "I can count 2 plugs" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "2", "2 switchs", "I see 2", "I see 2 switchs", "I can count 2 switchs" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "8", "8 objects", "I see 8", "I see 8 objects", "I can count 8 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many hand soap dispensers are in the scene?", "response_gt": [ "1", "1 hand soap dispensers", "I see 1", "I see 1 hand soap dispensers", "I can count 1 hand soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many paper towel dispensers are in the scene?", "response_gt": [ "1", "1 paper towel dispensers", "I see 1", "I see 1 paper towel dispensers", "I can count 1 paper towel dispensers" ], "response_pred": "2 paper towel dispensers" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many wall clocks are in the scene?", "response_gt": [ "1", "1 wall clocks", "I see 1", "I see 1 wall clocks", "I can count 1 wall clocks" ], "response_pred": "2 wall clocks" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many projectors are in the scene?", "response_gt": [ "1", "1 projectors", "I see 1", "I see 1 projectors", "I can count 1 projectors" ], "response_pred": "2 projectors" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "12", "12 ceiling lamps", "I see 12", "I see 12 ceiling lamps", "I can count 12 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "21d970d8de", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "2", "2 speakers", "I see 2", "I see 2 speakers", "I can count 2 speakers" ], "response_pred": "2 speakers" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "2", "2 whiteboards", "I see 2", "I see 2 whiteboards", "I can count 2 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "2", "2 ceiling lamps", "I see 2", "I see 2 ceiling lamps", "I can count 2 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many air vents are in the scene?", "response_gt": [ "1", "1 air vents", "I see 1", "I see 1 air vents", "I can count 1 air vents" ], "response_pred": "2 air vents" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "5", "5 office chairs", "I see 5", "I see 5 office chairs", "I can count 5 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "2", "2 switchs", "I see 2", "I see 2 switchs", "I can count 2 switchs" ], "response_pred": "1 switch" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many flat panel displays are in the scene?", "response_gt": [ "1", "1 flat panel displays", "I see 1", "I see 1 flat panel displays", "I can count 1 flat panel displays" ], "response_pred": "2 flat panel displays" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many overhead projectors are in the scene?", "response_gt": [ "1", "1 overhead projectors", "I see 1", "I see 1 overhead projectors", "I can count 1 overhead projectors" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "5748ce6f01", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many projectors are in the scene?", "response_gt": [ "1", "1 projectors", "I see 1", "I see 1 projectors", "I can count 1 projectors" ], "response_pred": "2 projectors" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "1", "1 speakers", "I see 1", "I see 1 speakers", "I can count 1 speakers" ], "response_pred": "2 speakers" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many wall clocks are in the scene?", "response_gt": [ "1", "1 wall clocks", "I see 1", "I see 1 wall clocks", "I can count 1 wall clocks" ], "response_pred": "2 wall clocks" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many blackboards are in the scene?", "response_gt": [ "4", "4 blackboards", "I see 4", "I see 4 blackboards", "I can count 4 blackboards" ], "response_pred": "2 blackboards" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "6", "6 ceiling lamps", "I see 6", "I see 6 ceiling lamps", "I can count 6 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "1", "1 exhaust fans", "I see 1", "I see 1 exhaust fans", "I can count 1 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "3", "3 doors", "I see 3", "I see 3 doors", "I can count 3 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "14", "14 tables", "I see 14", "I see 14 tables", "I can count 14 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many rolling carts are in the scene?", "response_gt": [ "1", "1 rolling carts", "I see 1", "I see 1 rolling carts", "I can count 1 rolling carts" ], "response_pred": "2 rolling carts" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "2", "2 splits", "I see 2", "I see 2 splits", "I can count 2 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "36", "36 chairs", "I see 36", "I see 36 chairs", "I can count 36 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many overhead projectors are in the scene?", "response_gt": [ "1", "1 overhead projectors", "I see 1", "I see 1 overhead projectors", "I can count 1 overhead projectors" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many toilet paper dispensers are in the scene?", "response_gt": [ "1", "1 toilet paper dispensers", "I see 1", "I see 1 toilet paper dispensers", "I can count 1 toilet paper dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "2", "2 trash bins", "I see 2", "I see 2 trash bins", "I can count 2 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many blackboard frames are in the scene?", "response_gt": [ "1", "1 blackboard frames", "I see 1", "I see 1 blackboard frames", "I can count 1 blackboard frames" ], "response_pred": "2 blackboard frames" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "1", "1 electrical ducts", "I see 1", "I see 1 electrical ducts", "I can count 1 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c4c04e6d6c", "instruction": " USER: How many pillars are in the scene?", "response_gt": [ "2", "2 pillars", "I see 2", "I see 2 pillars", "I can count 2 pillars" ], "response_pred": "2 pillars" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many removes are in the scene?", "response_gt": [ "1", "1 removes", "I see 1", "I see 1 removes", "I can count 1 removes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "7", "7 objects", "I see 7", "I see 7 objects", "I can count 7 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "4", "4 doors", "I see 4", "I see 4 doors", "I can count 4 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many floor lamps are in the scene?", "response_gt": [ "2", "2 floor lamps", "I see 2", "I see 2 floor lamps", "I can count 2 floor lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many beanbags are in the scene?", "response_gt": [ "5", "5 beanbags", "I see 5", "I see 5 beanbags", "I can count 5 beanbags" ], "response_pred": "2 beanbags" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "3", "3 cabinets", "I see 3", "I see 3 cabinets", "I can count 3 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "3", "3 structures", "I see 3", "I see 3 structures", "I can count 3 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many stools are in the scene?", "response_gt": [ "5", "5 stools", "I see 5", "I see 5 stools", "I can count 5 stools" ], "response_pred": "2 stools" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "5", "5 boxs", "I see 5", "I see 5 boxs", "I can count 5 boxs" ], "response_pred": "4 boxes" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many stoolss are in the scene?", "response_gt": [ "1", "1 stoolss", "I see 1", "I see 1 stoolss", "I can count 1 stoolss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many benchs are in the scene?", "response_gt": [ "4", "4 benchs", "I see 4", "I see 4 benchs", "I can count 4 benchs" ], "response_pred": "3 benches" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many projectors are in the scene?", "response_gt": [ "1", "1 projectors", "I see 1", "I see 1 projectors", "I can count 1 projectors" ], "response_pred": "2 projectors" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many cord covers are in the scene?", "response_gt": [ "1", "1 cord covers", "I see 1", "I see 1 cord covers", "I can count 1 cord covers" ], "response_pred": "2 cord covers" }, { "source": "scannet", "scene_id": "7831862f02", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "3", "3 ceiling lamps", "I see 3", "I see 3 ceiling lamps", "I can count 3 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many fake ceilings are in the scene?", "response_gt": [ "2", "2 fake ceilings", "I see 2", "I see 2 fake ceilings", "I can count 2 fake ceilings" ], "response_pred": "2 fake ceilings" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many projectors are in the scene?", "response_gt": [ "1", "1 projectors", "I see 1", "I see 1 projectors", "I can count 1 projectors" ], "response_pred": "2 projectors" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "8", "8 ceiling lamps", "I see 8", "I see 8 ceiling lamps", "I can count 8 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many pillars are in the scene?", "response_gt": [ "1", "1 pillars", "I see 1", "I see 1 pillars", "I can count 1 pillars" ], "response_pred": "2 pillars" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "2", "2 blindss", "I see 2", "I see 2 blindss", "I can count 2 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "3", "3 heaters", "I see 3", "I see 3 heaters", "I can count 3 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "4", "4 electrical ducts", "I see 4", "I see 4 electrical ducts", "I can count 4 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "47", "47 chairs", "I see 47", "I see 47 chairs", "I can count 47 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "9", "9 tables", "I see 9", "I see 9 tables", "I can count 9 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many routers are in the scene?", "response_gt": [ "1", "1 routers", "I see 1", "I see 1 routers", "I can count 1 routers" ], "response_pred": "2 routers" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many clocks are in the scene?", "response_gt": [ "1", "1 clocks", "I see 1", "I see 1 clocks", "I can count 1 clocks" ], "response_pred": "2 clocks" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "3", "3 whiteboards", "I see 3", "I see 3 whiteboards", "I can count 3 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many soap dispensers are in the scene?", "response_gt": [ "1", "1 soap dispensers", "I see 1", "I see 1 soap dispensers", "I can count 1 soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many paper towel dispensers are in the scene?", "response_gt": [ "1", "1 paper towel dispensers", "I see 1", "I see 1 paper towel dispensers", "I can count 1 paper towel dispensers" ], "response_pred": "2 paper towel dispensers" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "1", "1 whiteboard erasers", "I see 1", "I see 1 whiteboard erasers", "I can count 1 whiteboard erasers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many whiteboard stands are in the scene?", "response_gt": [ "2", "2 whiteboard stands", "I see 2", "I see 2 whiteboard stands", "I can count 2 whiteboard stands" ], "response_pred": "2 whiteboards" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many projector holders are in the scene?", "response_gt": [ "1", "1 projector holders", "I see 1", "I see 1 projector holders", "I can count 1 projector holders" ], "response_pred": "2 projector holders" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "3", "3 windows", "I see 3", "I see 3 windows", "I can count 3 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "2", "2 power strips", "I see 2", "I see 2 power strips", "I can count 2 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many celing lamps are in the scene?", "response_gt": [ "2", "2 celing lamps", "I see 2", "I see 2 celing lamps", "I can count 2 celing lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many wall outlets are in the scene?", "response_gt": [ "1", "1 wall outlets", "I see 1", "I see 1 wall outlets", "I can count 1 wall outlets" ], "response_pred": "2 wall outlets" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many projector screens are in the scene?", "response_gt": [ "1", "1 projector screens", "I see 1", "I see 1 projector screens", "I can count 1 projector screens" ], "response_pred": "2 projector screens" }, { "source": "scannet", "scene_id": "bde1e479ad", "instruction": " USER: How many joined tabless are in the scene?", "response_gt": [ "6", "6 joined tabless", "I see 6", "I see 6 joined tabless", "I can count 6 joined tabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "36", "36 chairs", "I see 36", "I see 36 chairs", "I can count 36 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many overhead projectors are in the scene?", "response_gt": [ "1", "1 overhead projectors", "I see 1", "I see 1 overhead projectors", "I can count 1 overhead projectors" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many removes are in the scene?", "response_gt": [ "2", "2 removes", "I see 2", "I see 2 removes", "I can count 2 removes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "8", "8 tables", "I see 8", "I see 8 tables", "I can count 8 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "10", "10 ceiling lamps", "I see 10", "I see 10 ceiling lamps", "I can count 10 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many projectors are in the scene?", "response_gt": [ "2", "2 projectors", "I see 2", "I see 2 projectors", "I can count 2 projectors" ], "response_pred": "2 projectors" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "3", "3 heaters", "I see 3", "I see 3 heaters", "I can count 3 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "4", "4 electrical ducts", "I see 4", "I see 4 electrical ducts", "I can count 4 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many fake ceilings are in the scene?", "response_gt": [ "1", "1 fake ceilings", "I see 1", "I see 1 fake ceilings", "I can count 1 fake ceilings" ], "response_pred": "2 fake ceilings" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many soap dispensers are in the scene?", "response_gt": [ "1", "1 soap dispensers", "I see 1", "I see 1 soap dispensers", "I can count 1 soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2 blindss" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "4", "4 windows", "I see 4", "I see 4 windows", "I can count 4 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many remote controllers are in the scene?", "response_gt": [ "2", "2 remote controllers", "I see 2", "I see 2 remote controllers", "I can count 2 remote controllers" ], "response_pred": "4 remotes" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many blind rails are in the scene?", "response_gt": [ "1", "1 blind rails", "I see 1", "I see 1 blind rails", "I can count 1 blind rails" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many coat hangers are in the scene?", "response_gt": [ "2", "2 coat hangers", "I see 2", "I see 2 coat hangers", "I can count 2 coat hangers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many window sills are in the scene?", "response_gt": [ "2", "2 window sills", "I see 2", "I see 2 window sills", "I can count 2 window sills" ], "response_pred": "2 window sills" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "38d58a7a31", "instruction": " USER: How many joined tabless are in the scene?", "response_gt": [ "3", "3 joined tabless", "I see 3", "I see 3 joined tabless", "I can count 3 joined tabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "9", "9 boxs", "I see 9", "I see 9 boxs", "I can count 9 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many calendars are in the scene?", "response_gt": [ "2", "2 calendars", "I see 2", "I see 2 calendars", "I can count 2 calendars" ], "response_pred": "2 calendars" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many copy machines are in the scene?", "response_gt": [ "1", "1 copy machines", "I see 1", "I see 1 copy machines", "I can count 1 copy machines" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many bottle crates are in the scene?", "response_gt": [ "6", "6 bottle crates", "I see 6", "I see 6 bottle crates", "I can count 6 bottle crates" ], "response_pred": "4 bottle crates" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "1", "1 refrigerators", "I see 1", "I see 1 refrigerators", "I can count 1 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many electric boxs are in the scene?", "response_gt": [ "1", "1 electric boxs", "I see 1", "I see 1 electric boxs", "I can count 1 electric boxs" ], "response_pred": "2 electric boxes" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5ee7c22ba0", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many pan sets are in the scene?", "response_gt": [ "1", "1 pan sets", "I see 1", "I see 1 pan sets", "I can count 1 pan sets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "3", "3 kitchen cabinets", "I see 3", "I see 3 kitchen cabinets", "I can count 3 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many dishwashers are in the scene?", "response_gt": [ "2", "2 dishwashers", "I see 2", "I see 2 dishwashers", "I can count 2 dishwashers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many stoves are in the scene?", "response_gt": [ "1", "1 stoves", "I see 1", "I see 1 stoves", "I can count 1 stoves" ], "response_pred": "2 stoves" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "2", "2 exhaust fans", "I see 2", "I see 2 exhaust fans", "I can count 2 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "2", "2 trash bins", "I see 2", "I see 2 trash bins", "I can count 2 trash bins" ], "response_pred": "2 trash bins" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "4", "4 papers", "I see 4", "I see 4 papers", "I can count 4 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "1", "1 refrigerators", "I see 1", "I see 1 refrigerators", "I can count 1 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "1", "1 plates", "I see 1", "I see 1 plates", "I can count 1 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many kitchen towels are in the scene?", "response_gt": [ "2", "2 kitchen towels", "I see 2", "I see 2 kitchen towels", "I can count 2 kitchen towels" ], "response_pred": "2 kitchen towels" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "2", "2 cups", "I see 2", "I see 2 cups", "I can count 2 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many chopping boards are in the scene?", "response_gt": [ "1", "1 chopping boards", "I see 1", "I see 1 chopping boards", "I can count 1 chopping boards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many cheeses are in the scene?", "response_gt": [ "2", "2 cheeses", "I see 2", "I see 2 cheeses", "I can count 2 cheeses" ], "response_pred": "4 cheeses" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many pinboards are in the scene?", "response_gt": [ "1", "1 pinboards", "I see 1", "I see 1 pinboards", "I can count 1 pinboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many knife sets are in the scene?", "response_gt": [ "3", "3 knife sets", "I see 3", "I see 3 knife sets", "I can count 3 knife sets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f9f95681fd", "instruction": " USER: How many coffee machines are in the scene?", "response_gt": [ "1", "1 coffee machines", "I see 1", "I see 1 coffee machines", "I can count 1 coffee machines" ], "response_pred": "2 coffee machines" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "34", "34 objects", "I see 34", "I see 34 objects", "I can count 34 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "1", "1 towels", "I see 1", "I see 1 towels", "I can count 1 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many trays are in the scene?", "response_gt": [ "1", "1 trays", "I see 1", "I see 1 trays", "I can count 1 trays" ], "response_pred": "3 trays" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many window sills are in the scene?", "response_gt": [ "1", "1 window sills", "I see 1", "I see 1 window sills", "I can count 1 window sills" ], "response_pred": "2 window sills" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "3", "3 trash cans", "I see 3", "I see 3 trash cans", "I can count 3 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many fridges are in the scene?", "response_gt": [ "1", "1 fridges", "I see 1", "I see 1 fridges", "I can count 1 fridges" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many kitchen drawers are in the scene?", "response_gt": [ "2", "2 kitchen drawers", "I see 2", "I see 2 kitchen drawers", "I can count 2 kitchen drawers" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many dishwashers are in the scene?", "response_gt": [ "1", "1 dishwashers", "I see 1", "I see 1 dishwashers", "I can count 1 dishwashers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many sockets are in the scene?", "response_gt": [ "3", "3 sockets", "I see 3", "I see 3 sockets", "I can count 3 sockets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many water bubblers are in the scene?", "response_gt": [ "1", "1 water bubblers", "I see 1", "I see 1 water bubblers", "I can count 1 water bubblers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many wall hooks are in the scene?", "response_gt": [ "1", "1 wall hooks", "I see 1", "I see 1 wall hooks", "I can count 1 wall hooks" ], "response_pred": "2 wall hooks" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "3", "3 cups", "I see 3", "I see 3 cups", "I can count 3 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many containers are in the scene?", "response_gt": [ "2", "2 containers", "I see 2", "I see 2 containers", "I can count 2 containers" ], "response_pred": "6 containers" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many serving trolleys are in the scene?", "response_gt": [ "1", "1 serving trolleys", "I see 1", "I see 1 serving trolleys", "I can count 1 serving trolleys" ], "response_pred": "2 serving trolleys" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many cabless are in the scene?", "response_gt": [ "1", "1 cabless", "I see 1", "I see 1 cabless", "I can count 1 cabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "2", "2 papers", "I see 2", "I see 2 papers", "I can count 2 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "1", "1 kitchen cabinets", "I see 1", "I see 1 kitchen cabinets", "I can count 1 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many espresso machines are in the scene?", "response_gt": [ "1", "1 espresso machines", "I see 1", "I see 1 espresso machines", "I can count 1 espresso machines" ], "response_pred": "2 espresso machines" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many utensilss are in the scene?", "response_gt": [ "5", "5 utensilss", "I see 5", "I see 5 utensilss", "I can count 5 utensilss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many coffee makers are in the scene?", "response_gt": [ "1", "1 coffee makers", "I see 1", "I see 1 coffee makers", "I can count 1 coffee makers" ], "response_pred": "2 coffee makers" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many water boilers are in the scene?", "response_gt": [ "1", "1 water boilers", "I see 1", "I see 1 water boilers", "I can count 1 water boilers" ], "response_pred": "2 water boilers" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many sink pipes are in the scene?", "response_gt": [ "1", "1 sink pipes", "I see 1", "I see 1 sink pipes", "I can count 1 sink pipes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3864514494", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "2", "2 sinks", "I see 2", "I see 2 sinks", "I can count 2 sinks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "9", "9 objects", "I see 9", "I see 9 objects", "I can count 9 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "6", "6 kitchen cabinets", "I see 6", "I see 6 kitchen cabinets", "I can count 6 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "3", "3 structures", "I see 3", "I see 3 structures", "I can count 3 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many dishwashers are in the scene?", "response_gt": [ "1", "1 dishwashers", "I see 1", "I see 1 dishwashers", "I can count 1 dishwashers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many reflections are in the scene?", "response_gt": [ "7", "7 reflections", "I see 7", "I see 7 reflections", "I can count 7 reflections" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many liquid soaps are in the scene?", "response_gt": [ "1", "1 liquid soaps", "I see 1", "I see 1 liquid soaps", "I can count 1 liquid soaps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "1", "1 taps", "I see 1", "I see 1 taps", "I can count 1 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many sink counters are in the scene?", "response_gt": [ "1", "1 sink counters", "I see 1", "I see 1 sink counters", "I can count 1 sink counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many water kettles are in the scene?", "response_gt": [ "1", "1 water kettles", "I see 1", "I see 1 water kettles", "I can count 1 water kettles" ], "response_pred": "2 water kettles" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many water filters are in the scene?", "response_gt": [ "1", "1 water filters", "I see 1", "I see 1 water filters", "I can count 1 water filters" ], "response_pred": "2 water filters" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many coffee makers are in the scene?", "response_gt": [ "1", "1 coffee makers", "I see 1", "I see 1 coffee makers", "I can count 1 coffee makers" ], "response_pred": "2 coffee makers" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "3", "3 trash bins", "I see 3", "I see 3 trash bins", "I can count 3 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "1", "1 refrigerators", "I see 1", "I see 1 refrigerators", "I can count 1 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many paper towelss are in the scene?", "response_gt": [ "2", "2 paper towelss", "I see 2", "I see 2 paper towelss", "I can count 2 paper towelss" ], "response_pred": "4 paper towel dispensers" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many fabrics are in the scene?", "response_gt": [ "1", "1 fabrics", "I see 1", "I see 1 fabrics", "I can count 1 fabrics" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "40aec5fffa", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many machines are in the scene?", "response_gt": [ "1", "1 machines", "I see 1", "I see 1 machines", "I can count 1 machines" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "15", "15 objects", "I see 15", "I see 15 objects", "I can count 15 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many cable trays are in the scene?", "response_gt": [ "1", "1 cable trays", "I see 1", "I see 1 cable trays", "I can count 1 cable trays" ], "response_pred": "2 cable trays" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many wooden planks are in the scene?", "response_gt": [ "2", "2 wooden planks", "I see 2", "I see 2 wooden planks", "I can count 2 wooden planks" ], "response_pred": "2 wooden planks" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "14", "14 pipes", "I see 14", "I see 14 pipes", "I can count 14 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many boards are in the scene?", "response_gt": [ "6", "6 boards", "I see 6", "I see 6 boards", "I can count 6 boards" ], "response_pred": "4 boards" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many wall cord covers are in the scene?", "response_gt": [ "1", "1 wall cord covers", "I see 1", "I see 1 wall cord covers", "I can count 1 wall cord covers" ], "response_pred": "2 wall cord covers" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many cardboards are in the scene?", "response_gt": [ "4", "4 cardboards", "I see 4", "I see 4 cardboards", "I can count 4 cardboards" ], "response_pred": "4 cardboards" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many cardboxs are in the scene?", "response_gt": [ "1", "1 cardboxs", "I see 1", "I see 1 cardboxs", "I can count 1 cardboxs" ], "response_pred": "2 cardbox" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "1", "1 structures", "I see 1", "I see 1 structures", "I can count 1 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many jerrycans are in the scene?", "response_gt": [ "1", "1 jerrycans", "I see 1", "I see 1 jerrycans", "I can count 1 jerrycans" ], "response_pred": "2 jerrycans" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many rodss are in the scene?", "response_gt": [ "1", "1 rodss", "I see 1", "I see 1 rodss", "I can count 1 rodss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many construction materialss are in the scene?", "response_gt": [ "1", "1 construction materialss", "I see 1", "I see 1 construction materialss", "I can count 1 construction materialss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many billboards are in the scene?", "response_gt": [ "1", "1 billboards", "I see 1", "I see 1 billboards", "I can count 1 billboards" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many tubelights are in the scene?", "response_gt": [ "1", "1 tubelights", "I see 1", "I see 1 tubelights", "I can count 1 tubelights" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "13c3e046d7", "instruction": " USER: How many tube lightss are in the scene?", "response_gt": [ "1", "1 tube lightss", "I see 1", "I see 1 tube lightss", "I can count 1 tube lightss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many cabinet doors are in the scene?", "response_gt": [ "1", "1 cabinet doors", "I see 1", "I see 1 cabinet doors", "I can count 1 cabinet doors" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "7", "7 pipes", "I see 7", "I see 7 pipes", "I can count 7 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many cardboard boxs are in the scene?", "response_gt": [ "23", "23 cardboard boxs", "I see 23", "I see 23 cardboard boxs", "I can count 23 cardboard boxs" ], "response_pred": "2 cardboard boxes" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many umbrellas are in the scene?", "response_gt": [ "2", "2 umbrellas", "I see 2", "I see 2 umbrellas", "I can count 2 umbrellas" ], "response_pred": "2 umbrellas" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "6", "6 storage cabinets", "I see 6", "I see 6 storage cabinets", "I can count 6 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many metal sheets are in the scene?", "response_gt": [ "3", "3 metal sheets", "I see 3", "I see 3 metal sheets", "I can count 3 metal sheets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "There are 5 shelves in the scene." }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "2", "2 objects", "I see 2", "I see 2 objects", "I can count 2 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many car doors are in the scene?", "response_gt": [ "1", "1 car doors", "I see 1", "I see 1 car doors", "I can count 1 car doors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "1", "1 monitors", "I see 1", "I see 1 monitors", "I can count 1 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "e398684d27", "instruction": " USER: How many cable trays are in the scene?", "response_gt": [ "2", "2 cable trays", "I see 2", "I see 2 cable trays", "I can count 2 cable trays" ], "response_pred": "2 cable trays" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "2", "2 cabinets", "I see 2", "I see 2 cabinets", "I can count 2 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "3", "3 trash cans", "I see 3", "I see 3 trash cans", "I can count 3 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "5", "5 ceiling lights", "I see 5", "I see 5 ceiling lights", "I can count 5 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "11", "11 windows", "I see 11", "I see 11 windows", "I can count 11 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many outlets are in the scene?", "response_gt": [ "4", "4 outlets", "I see 4", "I see 4 outlets", "I can count 4 outlets" ], "response_pred": "2 outlets" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many wall clocks are in the scene?", "response_gt": [ "1", "1 wall clocks", "I see 1", "I see 1 wall clocks", "I can count 1 wall clocks" ], "response_pred": "2 wall clocks" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "6", "6 pictures", "I see 6", "I see 6 pictures", "I can count 6 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "2", "2 pipes", "I see 2", "I see 2 pipes", "I can count 2 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "1", "1 switchs", "I see 1", "I see 1 switchs", "I can count 1 switchs" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "4", "4 tables", "I see 4", "I see 4 tables", "I can count 4 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many machines are in the scene?", "response_gt": [ "12", "12 machines", "I see 12", "I see 12 machines", "I can count 12 machines" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "35", "35 objects", "I see 35", "I see 35 objects", "I can count 35 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many cables are in the scene?", "response_gt": [ "2", "2 cables", "I see 2", "I see 2 cables", "I can count 2 cables" ], "response_pred": "3 cables" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many crates are in the scene?", "response_gt": [ "1", "1 crates", "I see 1", "I see 1 crates", "I can count 1 crates" ], "response_pred": "4 crates" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many helmets are in the scene?", "response_gt": [ "3", "3 helmets", "I see 3", "I see 3 helmets", "I can count 3 helmets" ], "response_pred": "2 helmets" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many toolss are in the scene?", "response_gt": [ "2", "2 toolss", "I see 2", "I see 2 toolss", "I can count 2 toolss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "4", "4 structures", "I see 4", "I see 4 structures", "I can count 4 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "1", "1 jackets", "I see 1", "I see 1 jackets", "I can count 1 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many cabless are in the scene?", "response_gt": [ "2", "2 cabless", "I see 2", "I see 2 cabless", "I can count 2 cabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many stools are in the scene?", "response_gt": [ "1", "1 stools", "I see 1", "I see 1 stools", "I can count 1 stools" ], "response_pred": "2 stools" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many power plugs are in the scene?", "response_gt": [ "1", "1 power plugs", "I see 1", "I see 1 power plugs", "I can count 1 power plugs" ], "response_pred": "2 power plugs" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many first aid kits are in the scene?", "response_gt": [ "1", "1 first aid kits", "I see 1", "I see 1 first aid kits", "I can count 1 first aid kits" ], "response_pred": "2 first aid kits" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many lamps are in the scene?", "response_gt": [ "1", "1 lamps", "I see 1", "I see 1 lamps", "I can count 1 lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many pinboards are in the scene?", "response_gt": [ "2", "2 pinboards", "I see 2", "I see 2 pinboards", "I can count 2 pinboards" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many notices are in the scene?", "response_gt": [ "6", "6 notices", "I see 6", "I see 6 notices", "I can count 6 notices" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many hammers are in the scene?", "response_gt": [ "1", "1 hammers", "I see 1", "I see 1 hammers", "I can count 1 hammers" ], "response_pred": "2 hammers" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many pallets are in the scene?", "response_gt": [ "1", "1 pallets", "I see 1", "I see 1 pallets", "I can count 1 pallets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "a8bf42d646", "instruction": " USER: How many carts are in the scene?", "response_gt": [ "1", "1 carts", "I see 1", "I see 1 carts", "I can count 1 carts" ], "response_pred": "3 carts" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many shower curtains are in the scene?", "response_gt": [ "1", "1 shower curtains", "I see 1", "I see 1 shower curtains", "I can count 1 shower curtains" ], "response_pred": "2 shower curtains" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many bathroom cabinets are in the scene?", "response_gt": [ "3", "3 bathroom cabinets", "I see 3", "I see 3 bathroom cabinets", "I can count 3 bathroom cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many washing machines are in the scene?", "response_gt": [ "1", "1 washing machines", "I see 1", "I see 1 washing machines", "I can count 1 washing machines" ], "response_pred": "2 washing machines" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "6", "6 bottles", "I see 6", "I see 6 bottles", "I can count 6 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many bathtubs are in the scene?", "response_gt": [ "1", "1 bathtubs", "I see 1", "I see 1 bathtubs", "I can count 1 bathtubs" ], "response_pred": "2 bathtubs" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many mirrors are in the scene?", "response_gt": [ "1", "1 mirrors", "I see 1", "I see 1 mirrors", "I can count 1 mirrors" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many mirror lamps are in the scene?", "response_gt": [ "1", "1 mirror lamps", "I see 1", "I see 1 mirror lamps", "I can count 1 mirror lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "6", "6 towels", "I see 6", "I see 6 towels", "I can count 6 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "1", "1 taps", "I see 1", "I see 1 taps", "I can count 1 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many toothpastes are in the scene?", "response_gt": [ "1", "1 toothpastes", "I see 1", "I see 1 toothpastes", "I can count 1 toothpastes" ], "response_pred": "2 toothpastes" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2 toilets" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "1", "1 toilet brushs", "I see 1", "I see 1 toilet brushs", "I can count 1 toilet brushs" ], "response_pred": "2 toilet brush" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "45b0dac5e3", "instruction": " USER: How many showers are in the scene?", "response_gt": [ "1", "1 showers", "I see 1", "I see 1 showers", "I can count 1 showers" ], "response_pred": "2 showers" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "11", "11 objects", "I see 11", "I see 11 objects", "I can count 11 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many grab bars are in the scene?", "response_gt": [ "3", "3 grab bars", "I see 3", "I see 3 grab bars", "I can count 3 grab bars" ], "response_pred": "2 grab bars" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many flush buttons are in the scene?", "response_gt": [ "1", "1 flush buttons", "I see 1", "I see 1 flush buttons", "I can count 1 flush buttons" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "1", "1 toilet brushs", "I see 1", "I see 1 toilet brushs", "I can count 1 toilet brushs" ], "response_pred": "2 toilet brush" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many showers are in the scene?", "response_gt": [ "1", "1 showers", "I see 1", "I see 1 showers", "I can count 1 showers" ], "response_pred": "2 showers" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many shower valves are in the scene?", "response_gt": [ "1", "1 shower valves", "I see 1", "I see 1 shower valves", "I can count 1 shower valves" ], "response_pred": "2 shower valves" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "2", "2 towels", "I see 2", "I see 2 towels", "I can count 2 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many shower loofahs are in the scene?", "response_gt": [ "1", "1 shower loofahs", "I see 1", "I see 1 shower loofahs", "I can count 1 shower loofahs" ], "response_pred": "2 shower loofahs" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many shower curtain rods are in the scene?", "response_gt": [ "1", "1 shower curtain rods", "I see 1", "I see 1 shower curtain rods", "I can count 1 shower curtain rods" ], "response_pred": "2 shower curtain rods" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many lamps are in the scene?", "response_gt": [ "1", "1 lamps", "I see 1", "I see 1 lamps", "I can count 1 lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many toothbrushs are in the scene?", "response_gt": [ "1", "1 toothbrushs", "I see 1", "I see 1 toothbrushs", "I can count 1 toothbrushs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many tooth pastes are in the scene?", "response_gt": [ "1", "1 tooth pastes", "I see 1", "I see 1 tooth pastes", "I can count 1 tooth pastes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many shower curtains are in the scene?", "response_gt": [ "1", "1 shower curtains", "I see 1", "I see 1 shower curtains", "I can count 1 shower curtains" ], "response_pred": "2 shower curtains" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many towel warmers are in the scene?", "response_gt": [ "1", "1 towel warmers", "I see 1", "I see 1 towel warmers", "I can count 1 towel warmers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many bathroom mats are in the scene?", "response_gt": [ "1", "1 bathroom mats", "I see 1", "I see 1 bathroom mats", "I can count 1 bathroom mats" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many toothbrush holders are in the scene?", "response_gt": [ "1", "1 toothbrush holders", "I see 1", "I see 1 toothbrush holders", "I can count 1 toothbrush holders" ], "response_pred": "2 toothbrush holders" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many hand soaps are in the scene?", "response_gt": [ "1", "1 hand soaps", "I see 1", "I see 1 hand soaps", "I can count 1 hand soaps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many shampoo bottles are in the scene?", "response_gt": [ "1", "1 shampoo bottles", "I see 1", "I see 1 shampoo bottles", "I can count 1 shampoo bottles" ], "response_pred": "2 shampoo bottles" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "1", "1 taps", "I see 1", "I see 1 taps", "I can count 1 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many mops are in the scene?", "response_gt": [ "1", "1 mops", "I see 1", "I see 1 mops", "I can count 1 mops" ], "response_pred": "2 mops" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many brooms are in the scene?", "response_gt": [ "1", "1 brooms", "I see 1", "I see 1 brooms", "I can count 1 brooms" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many dustpans are in the scene?", "response_gt": [ "1", "1 dustpans", "I see 1", "I see 1 dustpans", "I can count 1 dustpans" ], "response_pred": "2 dustpans" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many jeanss are in the scene?", "response_gt": [ "2", "2 jeanss", "I see 2", "I see 2 jeanss", "I can count 2 jeanss" ], "response_pred": "2 jeanss" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many trouserss are in the scene?", "response_gt": [ "1", "1 trouserss", "I see 1", "I see 1 trouserss", "I can count 1 trouserss" ], "response_pred": "2 trouserss" }, { "source": "scannet", "scene_id": "31a2c91c43", "instruction": " USER: How many pantss are in the scene?", "response_gt": [ "2", "2 pantss", "I see 2", "I see 2 pantss", "I can count 2 pantss" ], "response_pred": "2 pantss" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower taps are in the scene?", "response_gt": [ "1", "1 shower taps", "I see 1", "I see 1 shower taps", "I can count 1 shower taps" ], "response_pred": "1 shower tap" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower ceilings are in the scene?", "response_gt": [ "1", "1 shower ceilings", "I see 1", "I see 1 shower ceilings", "I can count 1 shower ceilings" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "4", "4 cabinets", "I see 4", "I see 4 cabinets", "I can count 4 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many showers are in the scene?", "response_gt": [ "1", "1 showers", "I see 1", "I see 1 showers", "I can count 1 showers" ], "response_pred": "2 showers" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "2", "2 ceiling lights", "I see 2", "I see 2 ceiling lights", "I can count 2 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower heads are in the scene?", "response_gt": [ "1", "1 shower heads", "I see 1", "I see 1 shower heads", "I can count 1 shower heads" ], "response_pred": "2 shower heads" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower walls are in the scene?", "response_gt": [ "3", "3 shower walls", "I see 3", "I see 3 shower walls", "I can count 3 shower walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many soap bottles are in the scene?", "response_gt": [ "7", "7 soap bottles", "I see 7", "I see 7 soap bottles", "I can count 7 soap bottles" ], "response_pred": "2 soap bottles" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many bar soaps are in the scene?", "response_gt": [ "1", "1 bar soaps", "I see 1", "I see 1 bar soaps", "I can count 1 bar soaps" ], "response_pred": "2 bar soaps" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "1", "1 shower floors", "I see 1", "I see 1 shower floors", "I can count 1 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "9", "9 objects", "I see 9", "I see 9 objects", "I can count 9 objects" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many mirrors are in the scene?", "response_gt": [ "2", "2 mirrors", "I see 2", "I see 2 mirrors", "I can count 2 mirrors" ], "response_pred": "2 mirrors" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shower rugs are in the scene?", "response_gt": [ "1", "1 shower rugs", "I see 1", "I see 1 shower rugs", "I can count 1 shower rugs" ], "response_pred": "2 shower rugs" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "8", "8 towels", "I see 8", "I see 8 towels", "I can count 8 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many washbasins are in the scene?", "response_gt": [ "1", "1 washbasins", "I see 1", "I see 1 washbasins", "I can count 1 washbasins" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many water taps are in the scene?", "response_gt": [ "2", "2 water taps", "I see 2", "I see 2 water taps", "I can count 2 water taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many lights are in the scene?", "response_gt": [ "6", "6 lights", "I see 6", "I see 6 lights", "I can count 6 lights" ], "response_pred": "2 lights" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many bathtubs are in the scene?", "response_gt": [ "1", "1 bathtubs", "I see 1", "I see 1 bathtubs", "I can count 1 bathtubs" ], "response_pred": "2 bathtubs" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many toothpastes are in the scene?", "response_gt": [ "2", "2 toothpastes", "I see 2", "I see 2 toothpastes", "I can count 2 toothpastes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many ceilng lights are in the scene?", "response_gt": [ "7", "7 ceilng lights", "I see 7", "I see 7 ceilng lights", "I can count 7 ceilng lights" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "2", "2 smoke detectors", "I see 2", "I see 2 smoke detectors", "I can count 2 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many ventilations are in the scene?", "response_gt": [ "2", "2 ventilations", "I see 2", "I see 2 ventilations", "I can count 2 ventilations" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "1", "1 papers", "I see 1", "I see 1 papers", "I can count 1 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many folding chairs are in the scene?", "response_gt": [ "1", "1 folding chairs", "I see 1", "I see 1 folding chairs", "I can count 1 folding chairs" ], "response_pred": "2 folding chairs" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "1", "1 toilet brushs", "I see 1", "I see 1 toilet brushs", "I can count 1 toilet brushs" ], "response_pred": "2 toilet brush" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "1", "1 structures", "I see 1", "I see 1 structures", "I can count 1 structures" ], "response_pred": "2 structures" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many ceiling vents are in the scene?", "response_gt": [ "2", "2 ceiling vents", "I see 2", "I see 2 ceiling vents", "I can count 2 ceiling vents" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "e7af285f7d", "instruction": " USER: How many bath mats are in the scene?", "response_gt": [ "1", "1 bath mats", "I see 1", "I see 1 bath mats", "I can count 1 bath mats" ], "response_pred": "2 bath mats" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "1", "1 taps", "I see 1", "I see 1 taps", "I can count 1 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "27", "27 objects", "I see 27", "I see 27 objects", "I can count 27 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "1", "1 ceiling lights", "I see 1", "I see 1 ceiling lights", "I can count 1 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shower rods are in the scene?", "response_gt": [ "1", "1 shower rods", "I see 1", "I see 1 shower rods", "I can count 1 shower rods" ], "response_pred": "2 shower rods" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shower valves are in the scene?", "response_gt": [ "1", "1 shower valves", "I see 1", "I see 1 shower valves", "I can count 1 shower valves" ], "response_pred": "2 shower valves" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shower heads are in the scene?", "response_gt": [ "1", "1 shower heads", "I see 1", "I see 1 shower heads", "I can count 1 shower heads" ], "response_pred": "2 shower heads" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "2", "2 shower floors", "I see 2", "I see 2 shower floors", "I can count 2 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shower walls are in the scene?", "response_gt": [ "4", "4 shower walls", "I see 4", "I see 4 shower walls", "I can count 4 shower walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many shampoo bottles are in the scene?", "response_gt": [ "1", "1 shampoo bottles", "I see 1", "I see 1 shampoo bottles", "I can count 1 shampoo bottles" ], "response_pred": "2 shampoo bottles" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "3", "3 towels", "I see 3", "I see 3 towels", "I can count 3 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many toilet paper holders are in the scene?", "response_gt": [ "1", "1 toilet paper holders", "I see 1", "I see 1 toilet paper holders", "I can count 1 toilet paper holders" ], "response_pred": "2 toilet paper holders" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "9", "9 bottles", "I see 9", "I see 9 bottles", "I can count 9 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many sockets are in the scene?", "response_gt": [ "1", "1 sockets", "I see 1", "I see 1 sockets", "I can count 1 sockets" ], "response_pred": "2 sockets" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many toilet paper rolls are in the scene?", "response_gt": [ "1", "1 toilet paper rolls", "I see 1", "I see 1 toilet paper rolls", "I can count 1 toilet paper rolls" ], "response_pred": "3 toilet paper rolls" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "1", "1 exhaust fans", "I see 1", "I see 1 exhaust fans", "I can count 1 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "1", "1 switchs", "I see 1", "I see 1 switchs", "I can count 1 switchs" ], "response_pred": "2 switchs" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many soap dispensers are in the scene?", "response_gt": [ "1", "1 soap dispensers", "I see 1", "I see 1 soap dispensers", "I can count 1 soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "4 sinks" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many toilet cleaners are in the scene?", "response_gt": [ "1", "1 toilet cleaners", "I see 1", "I see 1 toilet cleaners", "I can count 1 toilet cleaners" ], "response_pred": "2 toilet cleaners" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many flushess are in the scene?", "response_gt": [ "1", "1 flushess", "I see 1", "I see 1 flushess", "I can count 1 flushess" ], "response_pred": "2 flushess" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many hairdryers are in the scene?", "response_gt": [ "1", "1 hairdryers", "I see 1", "I see 1 hairdryers", "I can count 1 hairdryers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many cleaners are in the scene?", "response_gt": [ "1", "1 cleaners", "I see 1", "I see 1 cleaners", "I can count 1 cleaners" ], "response_pred": "2 cleaners" }, { "source": "scannet", "scene_id": "286b55a2bf", "instruction": " USER: How many toothbrushs are in the scene?", "response_gt": [ "2", "2 toothbrushs", "I see 2", "I see 2 toothbrushs", "I can count 2 toothbrushs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "2", "2 towels", "I see 2", "I see 2 towels", "I can count 2 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "4", "4 structures", "I see 4", "I see 4 structures", "I can count 4 structures" ], "response_pred": "2 structures" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shower walls are in the scene?", "response_gt": [ "3", "3 shower walls", "I see 3", "I see 3 shower walls", "I can count 3 shower walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many faucets are in the scene?", "response_gt": [ "1", "1 faucets", "I see 1", "I see 1 faucets", "I can count 1 faucets" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many hand showers are in the scene?", "response_gt": [ "1", "1 hand showers", "I see 1", "I see 1 hand showers", "I can count 1 hand showers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shower handles are in the scene?", "response_gt": [ "1", "1 shower handles", "I see 1", "I see 1 shower handles", "I can count 1 shower handles" ], "response_pred": "2 shower handles" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shower gels are in the scene?", "response_gt": [ "2", "2 shower gels", "I see 2", "I see 2 shower gels", "I can count 2 shower gels" ], "response_pred": "2 shower gels" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shower faucets are in the scene?", "response_gt": [ "1", "1 shower faucets", "I see 1", "I see 1 shower faucets", "I can count 1 shower faucets" ], "response_pred": "2 shower faucets" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many recessed shower shelves are in the scene?", "response_gt": [ "1", "1 recessed shower shelves", "I see 1", "I see 1 recessed shower shelves", "I can count 1 recessed shower shelves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many bathroom counters are in the scene?", "response_gt": [ "1", "1 bathroom counters", "I see 1", "I see 1 bathroom counters", "I can count 1 bathroom counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "1", "1 shower floors", "I see 1", "I see 1 shower floors", "I can count 1 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "3", "3 splits", "I see 3", "I see 3 splits", "I can count 3 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toothpastes are in the scene?", "response_gt": [ "1", "1 toothpastes", "I see 1", "I see 1 toothpastes", "I can count 1 toothpastes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "8", "8 objects", "I see 8", "I see 8 objects", "I can count 8 objects" ], "response_pred": "2 objects" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many hand washing soaps are in the scene?", "response_gt": [ "1", "1 hand washing soaps", "I see 1", "I see 1 hand washing soaps", "I can count 1 hand washing soaps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilet paper rollss are in the scene?", "response_gt": [ "1", "1 toilet paper rollss", "I see 1", "I see 1 toilet paper rollss", "I can count 1 toilet paper rollss" ], "response_pred": "4 toilet paper rolls" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilet paper holders are in the scene?", "response_gt": [ "1", "1 toilet paper holders", "I see 1", "I see 1 toilet paper holders", "I can count 1 toilet paper holders" ], "response_pred": "2 toilet paper holders" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "1", "1 toilet brushs", "I see 1", "I see 1 toilet brushs", "I can count 1 toilet brushs" ], "response_pred": "2 toilet brushs" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shelves are in the scene?", "response_gt": [ "1", "1 shelves", "I see 1", "I see 1 shelves", "I can count 1 shelves" ], "response_pred": "3 shelves" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many towelss are in the scene?", "response_gt": [ "1", "1 towelss", "I see 1", "I see 1 towelss", "I can count 1 towelss" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many shaving foams are in the scene?", "response_gt": [ "1", "1 shaving foams", "I see 1", "I see 1 shaving foams", "I can count 1 shaving foams" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many cleaning liquids are in the scene?", "response_gt": [ "1", "1 cleaning liquids", "I see 1", "I see 1 cleaning liquids", "I can count 1 cleaning liquids" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many surface cleaning liquids are in the scene?", "response_gt": [ "1", "1 surface cleaning liquids", "I see 1", "I see 1 surface cleaning liquids", "I can count 1 surface cleaning liquids" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many cloths are in the scene?", "response_gt": [ "1", "1 cloths", "I see 1", "I see 1 cloths", "I can count 1 cloths" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many razors are in the scene?", "response_gt": [ "1", "1 razors", "I see 1", "I see 1 razors", "I can count 1 razors" ], "response_pred": "2 razors" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many ventilators are in the scene?", "response_gt": [ "1", "1 ventilators", "I see 1", "I see 1 ventilators", "I can count 1 ventilators" ], "response_pred": "2 ventilators" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toilet flush buttons are in the scene?", "response_gt": [ "1", "1 toilet flush buttons", "I see 1", "I see 1 toilet flush buttons", "I can count 1 toilet flush buttons" ], "response_pred": "2 toilet flush buttons" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many recessed shelves are in the scene?", "response_gt": [ "1", "1 recessed shelves", "I see 1", "I see 1 recessed shelves", "I can count 1 recessed shelves" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toothbrush holders are in the scene?", "response_gt": [ "1", "1 toothbrush holders", "I see 1", "I see 1 toothbrush holders", "I can count 1 toothbrush holders" ], "response_pred": "2 toothbrush holders" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many toothbrushs are in the scene?", "response_gt": [ "1", "1 toothbrushs", "I see 1", "I see 1 toothbrushs", "I can count 1 toothbrushs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "7bc286c1b6", "instruction": " USER: How many lights are in the scene?", "response_gt": [ "1", "1 lights", "I see 1", "I see 1 lights", "I can count 1 lights" ], "response_pred": "2 lights" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many air vents are in the scene?", "response_gt": [ "2", "2 air vents", "I see 2", "I see 2 air vents", "I can count 2 air vents" ], "response_pred": "2 air vents" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many paper towel dispensers are in the scene?", "response_gt": [ "1", "1 paper towel dispensers", "I see 1", "I see 1 paper towel dispensers", "I can count 1 paper towel dispensers" ], "response_pred": "2 paper towel dispensers" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "2", "2 toilets", "I see 2", "I see 2 toilets", "I can count 2 toilets" ], "response_pred": "2 toilets" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "3", "3 trash bins", "I see 3", "I see 3 trash bins", "I can count 3 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many paper towels are in the scene?", "response_gt": [ "3", "3 paper towels", "I see 3", "I see 3 paper towels", "I can count 3 paper towels" ], "response_pred": "4 paper towels" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "1", "1 taps", "I see 1", "I see 1 taps", "I can count 1 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "3", "3 doors", "I see 3", "I see 3 doors", "I can count 3 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many door vents are in the scene?", "response_gt": [ "1", "1 door vents", "I see 1", "I see 1 door vents", "I can count 1 door vents" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many posters are in the scene?", "response_gt": [ "1", "1 posters", "I see 1", "I see 1 posters", "I can count 1 posters" ], "response_pred": "4 posters" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many soap dispensers are in the scene?", "response_gt": [ "1", "1 soap dispensers", "I see 1", "I see 1 soap dispensers", "I can count 1 soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many toilet paper dispensers are in the scene?", "response_gt": [ "1", "1 toilet paper dispensers", "I see 1", "I see 1 toilet paper dispensers", "I can count 1 toilet paper dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "2", "2 toilet brushs", "I see 2", "I see 2 toilet brushs", "I can count 2 toilet brushs" ], "response_pred": "2 toilet brush" }, { "source": "scannet", "scene_id": "f3685d06a9", "instruction": " USER: How many flush buttons are in the scene?", "response_gt": [ "2", "2 flush buttons", "I see 2", "I see 2 flush buttons", "I can count 2 flush buttons" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "25", "25 computer towers", "I see 25", "I see 25 computer towers", "I can count 25 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "3", "3 mouses", "I see 3", "I see 3 mouses", "I can count 3 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "2", "2 ceiling lamps", "I see 2", "I see 2 ceiling lamps", "I can count 2 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "3", "3 structures", "I see 3", "I see 3 structures", "I can count 3 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "2", "2 smoke detectors", "I see 2", "I see 2 smoke detectors", "I can count 2 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many storage racks are in the scene?", "response_gt": [ "3", "3 storage racks", "I see 3", "I see 3 storage racks", "I can count 3 storage racks" ], "response_pred": "3 storage racks" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "9", "9 objects", "I see 9", "I see 9 objects", "I can count 9 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "1", "1 monitors", "I see 1", "I see 1 monitors", "I can count 1 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many window sills are in the scene?", "response_gt": [ "1", "1 window sills", "I see 1", "I see 1 window sills", "I can count 1 window sills" ], "response_pred": "2 window sills" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "2", "2 heaters", "I see 2", "I see 2 heaters", "I can count 2 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "4", "4 pipes", "I see 4", "I see 4 pipes", "I can count 4 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "b0a08200c9", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "6", "6 electrical ducts", "I see 6", "I see 6 electrical ducts", "I can count 6 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "6", "6 boxs", "I see 6", "I see 6 boxs", "I can count 6 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "2", "2 power strips", "I see 2", "I see 2 power strips", "I can count 2 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many cable sockets are in the scene?", "response_gt": [ "5", "5 cable sockets", "I see 5", "I see 5 cable sockets", "I can count 5 cable sockets" ], "response_pred": "4 cable sockets" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "1", "1 printers", "I see 1", "I see 1 printers", "I can count 1 printers" ], "response_pred": "2 printers" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many paper shredders are in the scene?", "response_gt": [ "1", "1 paper shredders", "I see 1", "I see 1 paper shredders", "I can count 1 paper shredders" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "There are 5 shelves in the scene." }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many air conditioners are in the scene?", "response_gt": [ "1", "1 air conditioners", "I see 1", "I see 1 air conditioners", "I can count 1 air conditioners" ], "response_pred": "2 air conditioners" }, { "source": "scannet", "scene_id": "825d228aec", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "2", "2 exhaust fans", "I see 2", "I see 2 exhaust fans", "I can count 2 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "1", "1 printers", "I see 1", "I see 1 printers", "I can count 1 printers" ], "response_pred": "2 printers" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "2", "2 exhaust fans", "I see 2", "I see 2 exhaust fans", "I can count 2 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many air conditioners are in the scene?", "response_gt": [ "1", "1 air conditioners", "I see 1", "I see 1 air conditioners", "I can count 1 air conditioners" ], "response_pred": "2 air conditioners" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "4", "4 boxs", "I see 4", "I see 4 boxs", "I can count 4 boxs" ], "response_pred": "4 boxes" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "2", "2 papers", "I see 2", "I see 2 papers", "I can count 2 papers" ], "response_pred": "3 papers" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many photocopy machines are in the scene?", "response_gt": [ "1", "1 photocopy machines", "I see 1", "I see 1 photocopy machines", "I can count 1 photocopy machines" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "5", "5 electrical ducts", "I see 5", "I see 5 electrical ducts", "I can count 5 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "a980334473", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "There are 5 shelves in the scene." }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "6", "6 boxs", "I see 6", "I see 6 boxs", "I can count 6 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "5", "5 splits", "I see 5", "I see 5 splits", "I can count 5 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "2", "2 exhaust fans", "I see 2", "I see 2 exhaust fans", "I can count 2 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many air ducts are in the scene?", "response_gt": [ "1", "1 air ducts", "I see 1", "I see 1 air ducts", "I can count 1 air ducts" ], "response_pred": "2 air ducts" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many boards are in the scene?", "response_gt": [ "4", "4 boards", "I see 4", "I see 4 boards", "I can count 4 boards" ], "response_pred": "4 boards" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "1", "1 electrical ducts", "I see 1", "I see 1 electrical ducts", "I can count 1 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many paper rams are in the scene?", "response_gt": [ "2", "2 paper rams", "I see 2", "I see 2 paper rams", "I can count 2 paper rams" ], "response_pred": "2 paper rams" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "2", "2 objects", "I see 2", "I see 2 objects", "I can count 2 objects" ], "response_pred": "2 objects" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many copiers are in the scene?", "response_gt": [ "1", "1 copiers", "I see 1", "I see 1 copiers", "I can count 1 copiers" ], "response_pred": "2 copiers" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "f2dc06b1d2", "instruction": " USER: How many paper staplers are in the scene?", "response_gt": [ "4", "4 paper staplers", "I see 4", "I see 4 paper staplers", "I can count 4 paper staplers" ], "response_pred": "2 paper staplers" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many shower curtains are in the scene?", "response_gt": [ "1", "1 shower curtains", "I see 1", "I see 1 shower curtains", "I can count 1 shower curtains" ], "response_pred": "2 shower curtains" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many power panels are in the scene?", "response_gt": [ "1", "1 power panels", "I see 1", "I see 1 power panels", "I can count 1 power panels" ], "response_pred": "2 power panels" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many curtain rods are in the scene?", "response_gt": [ "1", "1 curtain rods", "I see 1", "I see 1 curtain rods", "I can count 1 curtain rods" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many wardrobes are in the scene?", "response_gt": [ "1", "1 wardrobes", "I see 1", "I see 1 wardrobes", "I can count 1 wardrobes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "4", "4 doors", "I see 4", "I see 4 doors", "I can count 4 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bedside tables are in the scene?", "response_gt": [ "1", "1 bedside tables", "I see 1", "I see 1 bedside tables", "I can count 1 bedside tables" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "3", "3 curtains", "I see 3", "I see 3 curtains", "I can count 3 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many faucets are in the scene?", "response_gt": [ "1", "1 faucets", "I see 1", "I see 1 faucets", "I can count 1 faucets" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many soap bottles are in the scene?", "response_gt": [ "1", "1 soap bottles", "I see 1", "I see 1 soap bottles", "I can count 1 soap bottles" ], "response_pred": "2 soap bottles" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many shower rugs are in the scene?", "response_gt": [ "1", "1 shower rugs", "I see 1", "I see 1 shower rugs", "I can count 1 shower rugs" ], "response_pred": "2 shower rugs" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "1", "1 shower floors", "I see 1", "I see 1 shower floors", "I can count 1 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many armchairs are in the scene?", "response_gt": [ "1", "1 armchairs", "I see 1", "I see 1 armchairs", "I can count 1 armchairs" ], "response_pred": "2 armchairs" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "4", "4 pillows", "I see 4", "I see 4 pillows", "I can count 4 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "2", "2 cabinets", "I see 2", "I see 2 cabinets", "I can count 2 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bathtubs are in the scene?", "response_gt": [ "1", "1 bathtubs", "I see 1", "I see 1 bathtubs", "I can count 1 bathtubs" ], "response_pred": "2 bathtubs" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bathroom shelfs are in the scene?", "response_gt": [ "1", "1 bathroom shelfs", "I see 1", "I see 1 bathroom shelfs", "I can count 1 bathroom shelfs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "2", "2 towels", "I see 2", "I see 2 towels", "I can count 2 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "2", "2 heaters", "I see 2", "I see 2 heaters", "I can count 2 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many rugs are in the scene?", "response_gt": [ "2", "2 rugs", "I see 2", "I see 2 rugs", "I can count 2 rugs" ], "response_pred": "2 rugs" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many mop cloths are in the scene?", "response_gt": [ "1", "1 mop cloths", "I see 1", "I see 1 mop cloths", "I can count 1 mop cloths" ], "response_pred": "2 mop cloths" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many baskets are in the scene?", "response_gt": [ "1", "1 baskets", "I see 1", "I see 1 baskets", "I can count 1 baskets" ], "response_pred": "4 baskets" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many carpets are in the scene?", "response_gt": [ "1", "1 carpets", "I see 1", "I see 1 carpets", "I can count 1 carpets" ], "response_pred": "2 carpets" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many tv stands are in the scene?", "response_gt": [ "1", "1 tv stands", "I see 1", "I see 1 tv stands", "I can count 1 tv stands" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many pots are in the scene?", "response_gt": [ "1", "1 pots", "I see 1", "I see 1 pots", "I can count 1 pots" ], "response_pred": "2 pots" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "6", "6 plants", "I see 6", "I see 6 plants", "I can count 6 plants" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many door handles are in the scene?", "response_gt": [ "1", "1 door handles", "I see 1", "I see 1 door handles", "I can count 1 door handles" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "2", "2 bags", "I see 2", "I see 2 bags", "I can count 2 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many yoga mats are in the scene?", "response_gt": [ "1", "1 yoga mats", "I see 1", "I see 1 yoga mats", "I can count 1 yoga mats" ], "response_pred": "2 yoga mats" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many guitars are in the scene?", "response_gt": [ "1", "1 guitars", "I see 1", "I see 1 guitars", "I can count 1 guitars" ], "response_pred": "2 guitars" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many photo frames are in the scene?", "response_gt": [ "2", "2 photo frames", "I see 2", "I see 2 photo frames", "I can count 2 photo frames" ], "response_pred": "4 photo frames" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "17", "17 objects", "I see 17", "I see 17 objects", "I can count 17 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "2", "2 tables", "I see 2", "I see 2 tables", "I can count 2 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many table mats are in the scene?", "response_gt": [ "1", "1 table mats", "I see 1", "I see 1 table mats", "I can count 1 table mats" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many pegboards are in the scene?", "response_gt": [ "1", "1 pegboards", "I see 1", "I see 1 pegboards", "I can count 1 pegboards" ], "response_pred": "4 pegboards" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "9", "9 papers", "I see 9", "I see 9 papers", "I can count 9 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many lamps are in the scene?", "response_gt": [ "1", "1 lamps", "I see 1", "I see 1 lamps", "I can count 1 lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many pen holders are in the scene?", "response_gt": [ "1", "1 pen holders", "I see 1", "I see 1 pen holders", "I can count 1 pen holders" ], "response_pred": "3 pen holders" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "4", "4 boxs", "I see 4", "I see 4 boxs", "I can count 4 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "6", "6 plant pots", "I see 6", "I see 6 plant pots", "I can count 6 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "6", "6 toilet papers", "I see 6", "I see 6 toilet papers", "I can count 6 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many plant pot mats are in the scene?", "response_gt": [ "3", "3 plant pot mats", "I see 3", "I see 3 plant pot mats", "I can count 3 plant pot mats" ], "response_pred": "4 plant pot mats" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many spray bottles are in the scene?", "response_gt": [ "1", "1 spray bottles", "I see 1", "I see 1 spray bottles", "I can count 1 spray bottles" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "1", "1 plates", "I see 1", "I see 1 plates", "I can count 1 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many plant pot coasters are in the scene?", "response_gt": [ "2", "2 plant pot coasters", "I see 2", "I see 2 plant pot coasters", "I can count 2 plant pot coasters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many floor lamps are in the scene?", "response_gt": [ "1", "1 floor lamps", "I see 1", "I see 1 floor lamps", "I can count 1 floor lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many boards are in the scene?", "response_gt": [ "1", "1 boards", "I see 1", "I see 1 boards", "I can count 1 boards" ], "response_pred": "4 boards" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many mirrors are in the scene?", "response_gt": [ "1", "1 mirrors", "I see 1", "I see 1 mirrors", "I can count 1 mirrors" ], "response_pred": "2 mirrors" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many dumbbells are in the scene?", "response_gt": [ "4", "4 dumbbells", "I see 4", "I see 4 dumbbells", "I can count 4 dumbbells" ], "response_pred": "2 dumbbells" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many cosmetic bags are in the scene?", "response_gt": [ "3", "3 cosmetic bags", "I see 3", "I see 3 cosmetic bags", "I can count 3 cosmetic bags" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many lint rollers are in the scene?", "response_gt": [ "1", "1 lint rollers", "I see 1", "I see 1 lint rollers", "I can count 1 lint rollers" ], "response_pred": "2 lint rollers" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bunny chocolates are in the scene?", "response_gt": [ "3", "3 bunny chocolates", "I see 3", "I see 3 bunny chocolates", "I can count 3 bunny chocolates" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "5942004064", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "1", "1 blankets", "I see 1", "I see 1 blankets", "I can count 1 blankets" ], "response_pred": "2 blankets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many books are in the scene?", "response_gt": [ "7", "7 books", "I see 7", "I see 7 books", "I can count 7 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many clothess are in the scene?", "response_gt": [ "1", "1 clothess", "I see 1", "I see 1 clothess", "I can count 1 clothess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "2", "2 speakers", "I see 2", "I see 2 speakers", "I can count 2 speakers" ], "response_pred": "2 speakers" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "2", "2 bags", "I see 2", "I see 2 bags", "I can count 2 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "5", "5 shelfs", "I see 5", "I see 5 shelfs", "I can count 5 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many brooms are in the scene?", "response_gt": [ "1", "1 brooms", "I see 1", "I see 1 brooms", "I can count 1 brooms" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many sliding doors are in the scene?", "response_gt": [ "2", "2 sliding doors", "I see 2", "I see 2 sliding doors", "I can count 2 sliding doors" ], "response_pred": "2 sliding doors" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shoe racks are in the scene?", "response_gt": [ "1", "1 shoe racks", "I see 1", "I see 1 shoe racks", "I can count 1 shoe racks" ], "response_pred": "3 shoe racks" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "3", "3 objects", "I see 3", "I see 3 objects", "I can count 3 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shoess are in the scene?", "response_gt": [ "10", "10 shoess", "I see 10", "I see 10 shoess", "I can count 10 shoess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shower walls are in the scene?", "response_gt": [ "7", "7 shower walls", "I see 7", "I see 7 shower walls", "I can count 7 shower walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many lights are in the scene?", "response_gt": [ "2", "2 lights", "I see 2", "I see 2 lights", "I can count 2 lights" ], "response_pred": "2 lights" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many toilet paper holders are in the scene?", "response_gt": [ "1", "1 toilet paper holders", "I see 1", "I see 1 toilet paper holders", "I can count 1 toilet paper holders" ], "response_pred": "2 toilet paper holders" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many baskets are in the scene?", "response_gt": [ "1", "1 baskets", "I see 1", "I see 1 baskets", "I can count 1 baskets" ], "response_pred": "4 baskets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "2", "2 towels", "I see 2", "I see 2 towels", "I can count 2 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shower curtains are in the scene?", "response_gt": [ "1", "1 shower curtains", "I see 1", "I see 1 shower curtains", "I can count 1 shower curtains" ], "response_pred": "2 shower curtains" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many toilet flush buttons are in the scene?", "response_gt": [ "1", "1 toilet flush buttons", "I see 1", "I see 1 toilet flush buttons", "I can count 1 toilet flush buttons" ], "response_pred": "2 toilet flush buttons" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many glasss are in the scene?", "response_gt": [ "1", "1 glasss", "I see 1", "I see 1 glasss", "I can count 1 glasss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "1", "1 shower floors", "I see 1", "I see 1 shower floors", "I can count 1 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many posters are in the scene?", "response_gt": [ "2", "2 posters", "I see 2", "I see 2 posters", "I can count 2 posters" ], "response_pred": "4 posters" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many headphones are in the scene?", "response_gt": [ "1", "1 headphones", "I see 1", "I see 1 headphones", "I can count 1 headphones" ], "response_pred": "2 headphones" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "2", "2 backpacks", "I see 2", "I see 2 backpacks", "I can count 2 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many carpets are in the scene?", "response_gt": [ "1", "1 carpets", "I see 1", "I see 1 carpets", "I can count 1 carpets" ], "response_pred": "2 carpets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "1", "1 blankets", "I see 1", "I see 1 blankets", "I can count 1 blankets" ], "response_pred": "4 blankets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "2", "2 pillows", "I see 2", "I see 2 pillows", "I can count 2 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "1", "1 curtains", "I see 1", "I see 1 curtains", "I can count 1 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many standing lamps are in the scene?", "response_gt": [ "1", "1 standing lamps", "I see 1", "I see 1 standing lamps", "I can count 1 standing lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2 toilets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many cloths are in the scene?", "response_gt": [ "1", "1 cloths", "I see 1", "I see 1 cloths", "I can count 1 cloths" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "1", "1 mugs", "I see 1", "I see 1 mugs", "I can count 1 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many pedestal fans are in the scene?", "response_gt": [ "1", "1 pedestal fans", "I see 1", "I see 1 pedestal fans", "I can count 1 pedestal fans" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many laundry baskets are in the scene?", "response_gt": [ "1", "1 laundry baskets", "I see 1", "I see 1 laundry baskets", "I can count 1 laundry baskets" ], "response_pred": "4 laundry baskets" }, { "source": "scannet", "scene_id": "25f3b7a318", "instruction": " USER: How many intercom devices are in the scene?", "response_gt": [ "1", "1 intercom devices", "I see 1", "I see 1 intercom devices", "I can count 1 intercom devices" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many decorations are in the scene?", "response_gt": [ "2", "2 decorations", "I see 2", "I see 2 decorations", "I can count 2 decorations" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "1", "1 jackets", "I see 1", "I see 1 jackets", "I can count 1 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many foldable closets are in the scene?", "response_gt": [ "1", "1 foldable closets", "I see 1", "I see 1 foldable closets", "I can count 1 foldable closets" ], "response_pred": "2 foldable closets" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "3", "3 cabinets", "I see 3", "I see 3 cabinets", "I can count 3 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many shoess are in the scene?", "response_gt": [ "2", "2 shoess", "I see 2", "I see 2 shoess", "I can count 2 shoess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "2", "2 blankets", "I see 2", "I see 2 blankets", "I can count 2 blankets" ], "response_pred": "2 blankets" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "1", "1 pillows", "I see 1", "I see 1 pillows", "I can count 1 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "1", "1 plant pots", "I see 1", "I see 1 plant pots", "I can count 1 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "1", "1 plants", "I see 1", "I see 1 plants", "I can count 1 plants" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "bcd2436daf", "instruction": " USER: How many suitcases are in the scene?", "response_gt": [ "1", "1 suitcases", "I see 1", "I see 1 suitcases", "I can count 1 suitcases" ], "response_pred": "2 suitcases" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many storage ecabinets are in the scene?", "response_gt": [ "1", "1 storage ecabinets", "I see 1", "I see 1 storage ecabinets", "I can count 1 storage ecabinets" ], "response_pred": "3 storage bins" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "1", "1 storage cabinets", "I see 1", "I see 1 storage cabinets", "I can count 1 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many crates are in the scene?", "response_gt": [ "9", "9 crates", "I see 9", "I see 9 crates", "I can count 9 crates" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "17", "17 chairs", "I see 17", "I see 17 chairs", "I can count 17 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "7", "7 tables", "I see 7", "I see 7 tables", "I can count 7 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "1", "1 bookshelfs", "I see 1", "I see 1 bookshelfs", "I can count 1 bookshelfs" ], "response_pred": "3 bookshelf" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "2", "2 ceiling lamps", "I see 2", "I see 2 ceiling lamps", "I can count 2 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many interactive whiteboards are in the scene?", "response_gt": [ "1", "1 interactive whiteboards", "I see 1", "I see 1 interactive whiteboards", "I can count 1 interactive whiteboards" ], "response_pred": "2 interactive whiteboards" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many outlets are in the scene?", "response_gt": [ "3", "3 outlets", "I see 3", "I see 3 outlets", "I can count 3 outlets" ], "response_pred": "2 outlets" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "29", "29 objects", "I see 29", "I see 29 objects", "I can count 29 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "1", "1 power strips", "I see 1", "I see 1 power strips", "I can count 1 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many boards are in the scene?", "response_gt": [ "1", "1 boards", "I see 1", "I see 1 boards", "I can count 1 boards" ], "response_pred": "4 boards" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many vacuum cleaners are in the scene?", "response_gt": [ "1", "1 vacuum cleaners", "I see 1", "I see 1 vacuum cleaners", "I can count 1 vacuum cleaners" ], "response_pred": "2 vacuum cleaners" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "13", "13 bottles", "I see 13", "I see 13 bottles", "I can count 13 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bottless are in the scene?", "response_gt": [ "3", "3 bottless", "I see 3", "I see 3 bottless", "I can count 3 bottless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "3", "3 bags", "I see 3", "I see 3 bags", "I can count 3 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "3", "3 trash cans", "I see 3", "I see 3 trash cans", "I can count 3 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many clocks are in the scene?", "response_gt": [ "1", "1 clocks", "I see 1", "I see 1 clocks", "I can count 1 clocks" ], "response_pred": "2 clocks" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many posters are in the scene?", "response_gt": [ "4", "4 posters", "I see 4", "I see 4 posters", "I can count 4 posters" ], "response_pred": "4 posters" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many magnetss are in the scene?", "response_gt": [ "1", "1 magnetss", "I see 1", "I see 1 magnetss", "I can count 1 magnetss" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "6", "6 boxs", "I see 6", "I see 6 boxs", "I can count 6 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many jugs are in the scene?", "response_gt": [ "2", "2 jugs", "I see 2", "I see 2 jugs", "I can count 2 jugs" ], "response_pred": "4 jugs" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many coffee machines are in the scene?", "response_gt": [ "1", "1 coffee machines", "I see 1", "I see 1 coffee machines", "I can count 1 coffee machines" ], "response_pred": "2 coffee machines" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "1", "1 papers", "I see 1", "I see 1 papers", "I can count 1 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many buckets are in the scene?", "response_gt": [ "1", "1 buckets", "I see 1", "I see 1 buckets", "I can count 1 buckets" ], "response_pred": "3 buckets" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many platess are in the scene?", "response_gt": [ "2", "2 platess", "I see 2", "I see 2 platess", "I can count 2 platess" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bowls are in the scene?", "response_gt": [ "2", "2 bowls", "I see 2", "I see 2 bowls", "I can count 2 bowls" ], "response_pred": "2 bowls" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many french presss are in the scene?", "response_gt": [ "1", "1 french presss", "I see 1", "I see 1 french presss", "I can count 1 french presss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "1", "1 towels", "I see 1", "I see 1 towels", "I can count 1 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "1", "1 plant pots", "I see 1", "I see 1 plant pots", "I can count 1 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "f3d64c30f8", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "1", "1 plants", "I see 1", "I see 1 plants", "I can count 1 plants" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many baseball caps are in the scene?", "response_gt": [ "1", "1 baseball caps", "I see 1", "I see 1 baseball caps", "I can count 1 baseball caps" ], "response_pred": "2 baseball caps" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "10", "10 boxs", "I see 10", "I see 10 boxs", "I can count 10 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many laundry hampers are in the scene?", "response_gt": [ "1", "1 laundry hampers", "I see 1", "I see 1 laundry hampers", "I can count 1 laundry hampers" ], "response_pred": "4 laundry hampers" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "3 pictures" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "2", "2 jackets", "I see 2", "I see 2 jackets", "I can count 2 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many kitchen utensils are in the scene?", "response_gt": [ "4", "4 kitchen utensils", "I see 4", "I see 4 kitchen utensils", "I can count 4 kitchen utensils" ], "response_pred": "2 kitchen utensils" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "6", "6 objects", "I see 6", "I see 6 objects", "I can count 6 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many duffle bags are in the scene?", "response_gt": [ "1", "1 duffle bags", "I see 1", "I see 1 duffle bags", "I can count 1 duffle bags" ], "response_pred": "2 duffle bags" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "2", "2 mugs", "I see 2", "I see 2 mugs", "I can count 2 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "2", "2 bottles", "I see 2", "I see 2 bottles", "I can count 2 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many scissorss are in the scene?", "response_gt": [ "1", "1 scissorss", "I see 1", "I see 1 scissorss", "I can count 1 scissorss" ], "response_pred": "2 scissorss" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many cutting boards are in the scene?", "response_gt": [ "1", "1 cutting boards", "I see 1", "I see 1 cutting boards", "I can count 1 cutting boards" ], "response_pred": "2 cutting boards" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "2", "2 kitchen cabinets", "I see 2", "I see 2 kitchen cabinets", "I can count 2 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many smoke alarms are in the scene?", "response_gt": [ "1", "1 smoke alarms", "I see 1", "I see 1 smoke alarms", "I can count 1 smoke alarms" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many bluetooth speakers are in the scene?", "response_gt": [ "1", "1 bluetooth speakers", "I see 1", "I see 1 bluetooth speakers", "I can count 1 bluetooth speakers" ], "response_pred": "3 bluetooth speakers" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "1", "1 monitors", "I see 1", "I see 1 monitors", "I can count 1 monitors" ], "response_pred": "3 monitors" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many desk lamps are in the scene?", "response_gt": [ "1", "1 desk lamps", "I see 1", "I see 1 desk lamps", "I can count 1 desk lamps" ], "response_pred": "2 desk lamps" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many sweaters are in the scene?", "response_gt": [ "1", "1 sweaters", "I see 1", "I see 1 sweaters", "I can count 1 sweaters" ], "response_pred": "2 sweaters" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many nightstands are in the scene?", "response_gt": [ "1", "1 nightstands", "I see 1", "I see 1 nightstands", "I can count 1 nightstands" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "2", "2 backpacks", "I see 2", "I see 2 backpacks", "I can count 2 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "3", "3 pillows", "I see 3", "I see 3 pillows", "I can count 3 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "2", "2 blankets", "I see 2", "I see 2 blankets", "I can count 2 blankets" ], "response_pred": "3 blankets" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many mattresss are in the scene?", "response_gt": [ "1", "1 mattresss", "I see 1", "I see 1 mattresss", "I can count 1 mattresss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many book shelfs are in the scene?", "response_gt": [ "1", "1 book shelfs", "I see 1", "I see 1 book shelfs", "I can count 1 book shelfs" ], "response_pred": "2 bookshelf" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many monitor lights are in the scene?", "response_gt": [ "1", "1 monitor lights", "I see 1", "I see 1 monitor lights", "I can count 1 monitor lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "3", "3 papers", "I see 3", "I see 3 papers", "I can count 3 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many sculptures are in the scene?", "response_gt": [ "2", "2 sculptures", "I see 2", "I see 2 sculptures", "I can count 2 sculptures" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "1", "1 toilet papers", "I see 1", "I see 1 toilet papers", "I can count 1 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many receipts are in the scene?", "response_gt": [ "1", "1 receipts", "I see 1", "I see 1 receipts", "I can count 1 receipts" ], "response_pred": "2 receipts" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many erasers are in the scene?", "response_gt": [ "1", "1 erasers", "I see 1", "I see 1 erasers", "I can count 1 erasers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many mousepads are in the scene?", "response_gt": [ "1", "1 mousepads", "I see 1", "I see 1 mousepads", "I can count 1 mousepads" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "2", "2 towels", "I see 2", "I see 2 towels", "I can count 2 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many instant pots are in the scene?", "response_gt": [ "1", "1 instant pots", "I see 1", "I see 1 instant pots", "I can count 1 instant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many toasters are in the scene?", "response_gt": [ "1", "1 toasters", "I see 1", "I see 1 toasters", "I can count 1 toasters" ], "response_pred": "2 toasters" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many salad spinners are in the scene?", "response_gt": [ "1", "1 salad spinners", "I see 1", "I see 1 salad spinners", "I can count 1 salad spinners" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many pans are in the scene?", "response_gt": [ "2", "2 pans", "I see 2", "I see 2 pans", "I can count 2 pans" ], "response_pred": "4 pans" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many hand vacuums are in the scene?", "response_gt": [ "1", "1 hand vacuums", "I see 1", "I see 1 hand vacuums", "I can count 1 hand vacuums" ], "response_pred": "2 hand vacuums" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many water filters are in the scene?", "response_gt": [ "1", "1 water filters", "I see 1", "I see 1 water filters", "I can count 1 water filters" ], "response_pred": "2 water filters" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "2", "2 windows", "I see 2", "I see 2 windows", "I can count 2 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many panels are in the scene?", "response_gt": [ "1", "1 panels", "I see 1", "I see 1 panels", "I can count 1 panels" ], "response_pred": "4 panels" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many clothess are in the scene?", "response_gt": [ "2", "2 clothess", "I see 2", "I see 2 clothess", "I can count 2 clothess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "0d2ee665be", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many roofs are in the scene?", "response_gt": [ "3", "3 roofs", "I see 3", "I see 3 roofs", "I can count 3 roofs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many pillars are in the scene?", "response_gt": [ "1", "1 pillars", "I see 1", "I see 1 pillars", "I can count 1 pillars" ], "response_pred": "2 pillars" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "6", "6 windows", "I see 6", "I see 6 windows", "I can count 6 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many removes are in the scene?", "response_gt": [ "4", "4 removes", "I see 4", "I see 4 removes", "I can count 4 removes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "1", "1 refrigerators", "I see 1", "I see 1 refrigerators", "I can count 1 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "4", "4 chairs", "I see 4", "I see 4 chairs", "I can count 4 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "5", "5 tables", "I see 5", "I see 5 tables", "I can count 5 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many piano chairs are in the scene?", "response_gt": [ "1", "1 piano chairs", "I see 1", "I see 1 piano chairs", "I can count 1 piano chairs" ], "response_pred": "2 piano chairs" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bathtubs are in the scene?", "response_gt": [ "1", "1 bathtubs", "I see 1", "I see 1 bathtubs", "I can count 1 bathtubs" ], "response_pred": "2 bathtubs" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "3", "3 ceiling lamps", "I see 3", "I see 3 ceiling lamps", "I can count 3 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many pianos are in the scene?", "response_gt": [ "1", "1 pianos", "I see 1", "I see 1 pianos", "I can count 1 pianos" ], "response_pred": "2 pianos" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many piano books are in the scene?", "response_gt": [ "1", "1 piano books", "I see 1", "I see 1 piano books", "I can count 1 piano books" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many headsets are in the scene?", "response_gt": [ "1", "1 headsets", "I see 1", "I see 1 headsets", "I can count 1 headsets" ], "response_pred": "2 headsets" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many beanbags are in the scene?", "response_gt": [ "2", "2 beanbags", "I see 2", "I see 2 beanbags", "I can count 2 beanbags" ], "response_pred": "2 beanbags" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many cartons are in the scene?", "response_gt": [ "1", "1 cartons", "I see 1", "I see 1 cartons", "I can count 1 cartons" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "5", "5 boxs", "I see 5", "I see 5 boxs", "I can count 5 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many air conditioners are in the scene?", "response_gt": [ "1", "1 air conditioners", "I see 1", "I see 1 air conditioners", "I can count 1 air conditioners" ], "response_pred": "2 air conditioners" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many smok alarms are in the scene?", "response_gt": [ "1", "1 smok alarms", "I see 1", "I see 1 smok alarms", "I can count 1 smok alarms" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many drying racks are in the scene?", "response_gt": [ "3", "3 drying racks", "I see 3", "I see 3 drying racks", "I can count 3 drying racks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many tv stands are in the scene?", "response_gt": [ "1", "1 tv stands", "I see 1", "I see 1 tv stands", "I can count 1 tv stands" ], "response_pred": "2 tv stands" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many tv receivers are in the scene?", "response_gt": [ "1", "1 tv receivers", "I see 1", "I see 1 tv receivers", "I can count 1 tv receivers" ], "response_pred": "2 tv receivers" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "1", "1 backpacks", "I see 1", "I see 1 backpacks", "I can count 1 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "4", "4 objects", "I see 4", "I see 4 objects", "I can count 4 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "19", "19 bottles", "I see 19", "I see 19 bottles", "I can count 19 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "2", "2 trash bins", "I see 2", "I see 2 trash bins", "I can count 2 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "6", "6 bags", "I see 6", "I see 6 bags", "I can count 6 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "1", "1 printers", "I see 1", "I see 1 printers", "I can count 1 printers" ], "response_pred": "2 printers" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bottless are in the scene?", "response_gt": [ "1", "1 bottless", "I see 1", "I see 1 bottless", "I can count 1 bottless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many shoes are in the scene?", "response_gt": [ "14", "14 shoes", "I see 14", "I see 14 shoes", "I can count 14 shoes" ], "response_pred": "3 shoes" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many brooms are in the scene?", "response_gt": [ "3", "3 brooms", "I see 3", "I see 3 brooms", "I can count 3 brooms" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "4", "4 jackets", "I see 4", "I see 4 jackets", "I can count 4 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many ovens are in the scene?", "response_gt": [ "1", "1 ovens", "I see 1", "I see 1 ovens", "I can count 1 ovens" ], "response_pred": "2 ovens" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "1", "1 kitchen cabinets", "I see 1", "I see 1 kitchen cabinets", "I can count 1 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many bowls are in the scene?", "response_gt": [ "1", "1 bowls", "I see 1", "I see 1 bowls", "I can count 1 bowls" ], "response_pred": "2 bowls" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "2", "2 sinks", "I see 2", "I see 2 sinks", "I can count 2 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "2", "2 taps", "I see 2", "I see 2 taps", "I can count 2 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many cutting boards are in the scene?", "response_gt": [ "1", "1 cutting boards", "I see 1", "I see 1 cutting boards", "I can count 1 cutting boards" ], "response_pred": "4 cutting boards" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "2", "2 plates", "I see 2", "I see 2 plates", "I can count 2 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many baskets are in the scene?", "response_gt": [ "4", "4 baskets", "I see 4", "I see 4 baskets", "I can count 4 baskets" ], "response_pred": "4 baskets" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many pans are in the scene?", "response_gt": [ "1", "1 pans", "I see 1", "I see 1 pans", "I can count 1 pans" ], "response_pred": "4 pans" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many stoves are in the scene?", "response_gt": [ "1", "1 stoves", "I see 1", "I see 1 stoves", "I can count 1 stoves" ], "response_pred": "2 stoves" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many shower faucets are in the scene?", "response_gt": [ "1", "1 shower faucets", "I see 1", "I see 1 shower faucets", "I can count 1 shower faucets" ], "response_pred": "1 shower faucet" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many toilet seats are in the scene?", "response_gt": [ "1", "1 toilet seats", "I see 1", "I see 1 toilet seats", "I can count 1 toilet seats" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many toilet brushs are in the scene?", "response_gt": [ "1", "1 toilet brushs", "I see 1", "I see 1 toilet brushs", "I can count 1 toilet brushs" ], "response_pred": "2 toilet brush" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "3", "3 toilet papers", "I see 3", "I see 3 toilet papers", "I can count 3 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "1", "1 towels", "I see 1", "I see 1 towels", "I can count 1 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many toilet paper rollss are in the scene?", "response_gt": [ "1", "1 toilet paper rollss", "I see 1", "I see 1 toilet paper rollss", "I can count 1 toilet paper rollss" ], "response_pred": "2 toilet paper rolls" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many washing machines are in the scene?", "response_gt": [ "1", "1 washing machines", "I see 1", "I see 1 washing machines", "I can count 1 washing machines" ], "response_pred": "2 washing machines" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many weighing scales are in the scene?", "response_gt": [ "1", "1 weighing scales", "I see 1", "I see 1 weighing scales", "I can count 1 weighing scales" ], "response_pred": "2 weighing scales" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many glass walls are in the scene?", "response_gt": [ "1", "1 glass walls", "I see 1", "I see 1 glass walls", "I can count 1 glass walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "3db0a1c8f3", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "1", "1 blankets", "I see 1", "I see 1 blankets", "I can count 1 blankets" ], "response_pred": "4 blankets" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "7", "7 office chairs", "I see 7", "I see 7 office chairs", "I can count 7 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many first aid boxs are in the scene?", "response_gt": [ "1", "1 first aid boxs", "I see 1", "I see 1 first aid boxs", "I can count 1 first aid boxs" ], "response_pred": "2 first aid box" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many ladders are in the scene?", "response_gt": [ "1", "1 ladders", "I see 1", "I see 1 ladders", "I can count 1 ladders" ], "response_pred": "2 ladders" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "8", "8 tables", "I see 8", "I see 8 tables", "I can count 8 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "5", "5 boxs", "I see 5", "I see 5 boxs", "I can count 5 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many softbox lights are in the scene?", "response_gt": [ "1", "1 softbox lights", "I see 1", "I see 1 softbox lights", "I can count 1 softbox lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "2", "2 chairs", "I see 2", "I see 2 chairs", "I can count 2 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "3", "3 doors", "I see 3", "I see 3 doors", "I can count 3 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "1", "1 backpacks", "I see 1", "I see 1 backpacks", "I can count 1 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "6", "6 keyboards", "I see 6", "I see 6 keyboards", "I can count 6 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "2", "2 cabinets", "I see 2", "I see 2 cabinets", "I can count 2 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "3", "3 objects", "I see 3", "I see 3 objects", "I can count 3 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many pedestal fans are in the scene?", "response_gt": [ "1", "1 pedestal fans", "I see 1", "I see 1 pedestal fans", "I can count 1 pedestal fans" ], "response_pred": "2 fans" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "7", "7 windows", "I see 7", "I see 7 windows", "I can count 7 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "1", "1 storage cabinets", "I see 1", "I see 1 storage cabinets", "I can count 1 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many pillars are in the scene?", "response_gt": [ "3", "3 pillars", "I see 3", "I see 3 pillars", "I can count 3 pillars" ], "response_pred": "2 pillars" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "9", "9 monitors", "I see 9", "I see 9 monitors", "I can count 9 monitors" ], "response_pred": "3 monitors" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "2", "2 mouses", "I see 2", "I see 2 mouses", "I can count 2 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "4", "4 heaters", "I see 4", "I see 4 heaters", "I can count 4 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "ac48a9b736", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many laboratory benchs are in the scene?", "response_gt": [ "1", "1 laboratory benchs", "I see 1", "I see 1 laboratory benchs", "I can count 1 laboratory benchs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "5", "5 cabinets", "I see 5", "I see 5 cabinets", "I can count 5 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many electrical ducts are in the scene?", "response_gt": [ "3", "3 electrical ducts", "I see 3", "I see 3 electrical ducts", "I can count 3 electrical ducts" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many laboratory cabinets are in the scene?", "response_gt": [ "1", "1 laboratory cabinets", "I see 1", "I see 1 laboratory cabinets", "I can count 1 laboratory cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many plastic cases are in the scene?", "response_gt": [ "2", "2 plastic cases", "I see 2", "I see 2 plastic cases", "I can count 2 plastic cases" ], "response_pred": "2 plastic cases" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many machines are in the scene?", "response_gt": [ "1", "1 machines", "I see 1", "I see 1 machines", "I can count 1 machines" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "3", "3 office chairs", "I see 3", "I see 3 office chairs", "I can count 3 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "3", "3 structures", "I see 3", "I see 3 structures", "I can count 3 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many pipes are in the scene?", "response_gt": [ "8", "8 pipes", "I see 8", "I see 8 pipes", "I can count 8 pipes" ], "response_pred": "2 pipes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many desk lamps are in the scene?", "response_gt": [ "2", "2 desk lamps", "I see 2", "I see 2 desk lamps", "I can count 2 desk lamps" ], "response_pred": "2 desk lamps" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many first aid cabinets are in the scene?", "response_gt": [ "1", "1 first aid cabinets", "I see 1", "I see 1 first aid cabinets", "I can count 1 first aid cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many spray bottles are in the scene?", "response_gt": [ "1", "1 spray bottles", "I see 1", "I see 1 spray bottles", "I can count 1 spray bottles" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many water pipes are in the scene?", "response_gt": [ "2", "2 water pipes", "I see 2", "I see 2 water pipes", "I can count 2 water pipes" ], "response_pred": "2 water pipes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many chemical bottles are in the scene?", "response_gt": [ "1", "1 chemical bottles", "I see 1", "I see 1 chemical bottles", "I can count 1 chemical bottles" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many soaps are in the scene?", "response_gt": [ "1", "1 soaps", "I see 1", "I see 1 soaps", "I can count 1 soaps" ], "response_pred": "2 soaps" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many faucets are in the scene?", "response_gt": [ "1", "1 faucets", "I see 1", "I see 1 faucets", "I can count 1 faucets" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "3", "3 exhaust fans", "I see 3", "I see 3 exhaust fans", "I can count 3 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many sink pipes are in the scene?", "response_gt": [ "1", "1 sink pipes", "I see 1", "I see 1 sink pipes", "I can count 1 sink pipes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many tool organizers are in the scene?", "response_gt": [ "1", "1 tool organizers", "I see 1", "I see 1 tool organizers", "I can count 1 tool organizers" ], "response_pred": "2 tool organizers" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "2", "2 trash cans", "I see 2", "I see 2 trash cans", "I can count 2 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many cardboard boxs are in the scene?", "response_gt": [ "2", "2 cardboard boxs", "I see 2", "I see 2 cardboard boxs", "I can count 2 cardboard boxs" ], "response_pred": "4 cardboard boxes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "8", "8 objects", "I see 8", "I see 8 objects", "I can count 8 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many oscilloscopes are in the scene?", "response_gt": [ "1", "1 oscilloscopes", "I see 1", "I see 1 oscilloscopes", "I can count 1 oscilloscopes" ], "response_pred": "2 oscilloscopes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many laboratory equipments are in the scene?", "response_gt": [ "10", "10 laboratory equipments", "I see 10", "I see 10 laboratory equipments", "I can count 10 laboratory equipments" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many soldering irons are in the scene?", "response_gt": [ "1", "1 soldering irons", "I see 1", "I see 1 soldering irons", "I can count 1 soldering irons" ], "response_pred": "2 soldering irons" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "5", "5 boxs", "I see 5", "I see 5 boxs", "I can count 5 boxs" ], "response_pred": "4 boxes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many binders are in the scene?", "response_gt": [ "3", "3 binders", "I see 3", "I see 3 binders", "I can count 3 binders" ], "response_pred": "3 binders" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many buttons are in the scene?", "response_gt": [ "1", "1 buttons", "I see 1", "I see 1 buttons", "I can count 1 buttons" ], "response_pred": "3 buttons" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many silicon guns are in the scene?", "response_gt": [ "1", "1 silicon guns", "I see 1", "I see 1 silicon guns", "I can count 1 silicon guns" ], "response_pred": "2 silicon guns" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many tool boxs are in the scene?", "response_gt": [ "1", "1 tool boxs", "I see 1", "I see 1 tool boxs", "I can count 1 tool boxs" ], "response_pred": "2 tool boxes" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many alligator clipss are in the scene?", "response_gt": [ "5", "5 alligator clipss", "I see 5", "I see 5 alligator clipss", "I can count 5 alligator clipss" ], "response_pred": "2 alligator clipss are in scene" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many wooden sticks are in the scene?", "response_gt": [ "1", "1 wooden sticks", "I see 1", "I see 1 wooden sticks", "I can count 1 wooden sticks" ], "response_pred": "2 wooden sticks" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many electrical pipes are in the scene?", "response_gt": [ "4", "4 electrical pipes", "I see 4", "I see 4 electrical pipes", "I can count 4 electrical pipes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many barcode scanners are in the scene?", "response_gt": [ "1", "1 barcode scanners", "I see 1", "I see 1 barcode scanners", "I can count 1 barcode scanners" ], "response_pred": "2 barcode scanners" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many covered power sockets are in the scene?", "response_gt": [ "1", "1 covered power sockets", "I see 1", "I see 1 covered power sockets", "I can count 1 covered power sockets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "1", "1 ceiling lights", "I see 1", "I see 1 ceiling lights", "I can count 1 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many outlets are in the scene?", "response_gt": [ "1", "1 outlets", "I see 1", "I see 1 outlets", "I can count 1 outlets" ], "response_pred": "2 outlets" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many toolss are in the scene?", "response_gt": [ "1", "1 toolss", "I see 1", "I see 1 toolss", "I can count 1 toolss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c5439f4607", "instruction": " USER: How many cable ducts are in the scene?", "response_gt": [ "1", "1 cable ducts", "I see 1", "I see 1 cable ducts", "I can count 1 cable ducts" ], "response_pred": "2 cable ducts" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "19", "19 tables", "I see 19", "I see 19 tables", "I can count 19 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many pedestal fans are in the scene?", "response_gt": [ "2", "2 pedestal fans", "I see 2", "I see 2 pedestal fans", "I can count 2 pedestal fans" ], "response_pred": "2 fans" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "6", "6 trash bins", "I see 6", "I see 6 trash bins", "I can count 6 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many exhaust fans are in the scene?", "response_gt": [ "1", "1 exhaust fans", "I see 1", "I see 1 exhaust fans", "I can count 1 exhaust fans" ], "response_pred": "2 exhaust fans" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "11", "11 windows", "I see 11", "I see 11 windows", "I can count 11 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many pillars are in the scene?", "response_gt": [ "1", "1 pillars", "I see 1", "I see 1 pillars", "I can count 1 pillars" ], "response_pred": "4 pillars" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "9", "9 boxs", "I see 9", "I see 9 boxs", "I can count 9 boxs" ], "response_pred": "4 boxes" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "8", "8 cabinets", "I see 8", "I see 8 cabinets", "I can count 8 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "16", "16 monitors", "I see 16", "I see 16 monitors", "I can count 16 monitors" ], "response_pred": "3 monitors" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many emergency kits are in the scene?", "response_gt": [ "1", "1 emergency kits", "I see 1", "I see 1 emergency kits", "I can count 1 emergency kits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many wooden pallets are in the scene?", "response_gt": [ "1", "1 wooden pallets", "I see 1", "I see 1 wooden pallets", "I can count 1 wooden pallets" ], "response_pred": "4 wooden pallets" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many platform trolleys are in the scene?", "response_gt": [ "1", "1 platform trolleys", "I see 1", "I see 1 platform trolleys", "I can count 1 platform trolleys" ], "response_pred": "2 platform trolleys" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "10", "10 keyboards", "I see 10", "I see 10 keyboards", "I can count 10 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many computer towers are in the scene?", "response_gt": [ "8", "8 computer towers", "I see 8", "I see 8 computer towers", "I can count 8 computer towers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "4", "4 whiteboards", "I see 4", "I see 4 whiteboards", "I can count 4 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "5", "5 objects", "I see 5", "I see 5 objects", "I can count 5 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many joined tabless are in the scene?", "response_gt": [ "1", "1 joined tabless", "I see 1", "I see 1 joined tabless", "I can count 1 joined tabless" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "7", "7 storage cabinets", "I see 7", "I see 7 storage cabinets", "I can count 7 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many floor mounted air conditioners are in the scene?", "response_gt": [ "2", "2 floor mounted air conditioners", "I see 2", "I see 2 floor mounted air conditioners", "I can count 2 floor mounted air conditioners" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "10", "10 office chairs", "I see 10", "I see 10 office chairs", "I can count 10 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "8", "8 mouses", "I see 8", "I see 8 mouses", "I can count 8 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many tripods are in the scene?", "response_gt": [ "1", "1 tripods", "I see 1", "I see 1 tripods", "I can count 1 tripods" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many machines are in the scene?", "response_gt": [ "2", "2 machines", "I see 2", "I see 2 machines", "I can count 2 machines" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "1", "1 printers", "I see 1", "I see 1 printers", "I can count 1 printers" ], "response_pred": "3 printers" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "1", "1 mugs", "I see 1", "I see 1 mugs", "I can count 1 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many jugs are in the scene?", "response_gt": [ "1", "1 jugs", "I see 1", "I see 1 jugs", "I can count 1 jugs" ], "response_pred": "4 jugs" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "25", "25 ceiling lamps", "I see 25", "I see 25 ceiling lamps", "I can count 25 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many spotlights are in the scene?", "response_gt": [ "2", "2 spotlights", "I see 2", "I see 2 spotlights", "I can count 2 spotlights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many table lamps are in the scene?", "response_gt": [ "1", "1 table lamps", "I see 1", "I see 1 table lamps", "I can count 1 table lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many oscilloscopes are in the scene?", "response_gt": [ "1", "1 oscilloscopes", "I see 1", "I see 1 oscilloscopes", "I can count 1 oscilloscopes" ], "response_pred": "2 oscilloscopes" }, { "source": "scannet", "scene_id": "578511c8a9", "instruction": " USER: How many lab equipments are in the scene?", "response_gt": [ "2", "2 lab equipments", "I see 2", "I see 2 lab equipments", "I can count 2 lab equipments" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many storage cabinets are in the scene?", "response_gt": [ "1", "1 storage cabinets", "I see 1", "I see 1 storage cabinets", "I can count 1 storage cabinets" ], "response_pred": "3 storage cabinets" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "5", "5 cabinets", "I see 5", "I see 5 cabinets", "I can count 5 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "3", "3 office chairs", "I see 3", "I see 3 office chairs", "I can count 3 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "3", "3 tables", "I see 3", "I see 3 tables", "I can count 3 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many air ducts are in the scene?", "response_gt": [ "2", "2 air ducts", "I see 2", "I see 2 air ducts", "I can count 2 air ducts" ], "response_pred": "2 air ducts" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "2", "2 jackets", "I see 2", "I see 2 jackets", "I can count 2 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "7", "7 boxs", "I see 7", "I see 7 boxs", "I can count 7 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many paper rams are in the scene?", "response_gt": [ "1", "1 paper rams", "I see 1", "I see 1 paper rams", "I can count 1 paper rams" ], "response_pred": "2 paper rams" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "8", "8 bottles", "I see 8", "I see 8 bottles", "I can count 8 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many paper racks are in the scene?", "response_gt": [ "2", "2 paper racks", "I see 2", "I see 2 paper racks", "I can count 2 paper racks" ], "response_pred": "4 paper racks" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many paper staplers are in the scene?", "response_gt": [ "3", "3 paper staplers", "I see 3", "I see 3 paper staplers", "I can count 3 paper staplers" ], "response_pred": "2 paper staplers" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many calendars are in the scene?", "response_gt": [ "1", "1 calendars", "I see 1", "I see 1 calendars", "I can count 1 calendars" ], "response_pred": "2 calendars" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "2", "2 ceiling lamps", "I see 2", "I see 2 ceiling lamps", "I can count 2 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "3", "3 monitors", "I see 3", "I see 3 monitors", "I can count 3 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "3", "3 mouses", "I see 3", "I see 3 mouses", "I can count 3 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many folders are in the scene?", "response_gt": [ "2", "2 folders", "I see 2", "I see 2 folders", "I can count 2 folders" ], "response_pred": "2 folders" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many calculators are in the scene?", "response_gt": [ "1", "1 calculators", "I see 1", "I see 1 calculators", "I can count 1 calculators" ], "response_pred": "2 calculators" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many pencil standes are in the scene?", "response_gt": [ "1", "1 pencil standes", "I see 1", "I see 1 pencil standes", "I can count 1 pencil standes" ], "response_pred": "2 pencil stands" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "4", "4 objects", "I see 4", "I see 4 objects", "I can count 4 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "4", "4 trash bins", "I see 4", "I see 4 trash bins", "I can count 4 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "2", "2 printers", "I see 2", "I see 2 printers", "I can count 2 printers" ], "response_pred": "2 printers" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many shoess are in the scene?", "response_gt": [ "1", "1 shoess", "I see 1", "I see 1 shoess", "I can count 1 shoess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many photos are in the scene?", "response_gt": [ "1", "1 photos", "I see 1", "I see 1 photos", "I can count 1 photos" ], "response_pred": "3 photos" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "2", "2 plant pots", "I see 2", "I see 2 plant pots", "I can count 2 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "3", "3 plants", "I see 3", "I see 3 plants", "I can count 3 plants" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "6", "6 windows", "I see 6", "I see 6 windows", "I can count 6 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "2", "2 keyboards", "I see 2", "I see 2 keyboards", "I can count 2 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many pen holders are in the scene?", "response_gt": [ "4", "4 pen holders", "I see 4", "I see 4 pen holders", "I can count 4 pen holders" ], "response_pred": "3 pen holders" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many paper towels are in the scene?", "response_gt": [ "1", "1 paper towels", "I see 1", "I see 1 paper towels", "I can count 1 paper towels" ], "response_pred": "4 paper towels" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "2", "2 bookss", "I see 2", "I see 2 bookss", "I can count 2 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "d755b3d9d8", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "5", "5 blindss", "I see 5", "I see 5 blindss", "I can count 5 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many bar stools are in the scene?", "response_gt": [ "2", "2 bar stools", "I see 2", "I see 2 bar stools", "I can count 2 bar stools" ], "response_pred": "2 bar stools" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many stools are in the scene?", "response_gt": [ "1", "1 stools", "I see 1", "I see 1 stools", "I can count 1 stools" ], "response_pred": "2 stools" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "74", "74 objects", "I see 74", "I see 74 objects", "I can count 74 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many milling machines are in the scene?", "response_gt": [ "2", "2 milling machines", "I see 2", "I see 2 milling machines", "I can count 2 milling machines" ], "response_pred": "2 milling machines" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many garbage bins are in the scene?", "response_gt": [ "2", "2 garbage bins", "I see 2", "I see 2 garbage bins", "I can count 2 garbage bins" ], "response_pred": "3 garbage bins" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many workbenchs are in the scene?", "response_gt": [ "1", "1 workbenchs", "I see 1", "I see 1 workbenchs", "I can count 1 workbenchs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many paperss are in the scene?", "response_gt": [ "3", "3 paperss", "I see 3", "I see 3 paperss", "I can count 3 paperss" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many plastic cases are in the scene?", "response_gt": [ "15", "15 plastic cases", "I see 15", "I see 15 plastic cases", "I can count 15 plastic cases" ], "response_pred": "2 plastic cases" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many buckets are in the scene?", "response_gt": [ "4", "4 buckets", "I see 4", "I see 4 buckets", "I can count 4 buckets" ], "response_pred": "3 buckets" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many folding screens are in the scene?", "response_gt": [ "1", "1 folding screens", "I see 1", "I see 1 folding screens", "I can count 1 folding screens" ], "response_pred": "2 folding screens" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many wooden brushs are in the scene?", "response_gt": [ "3", "3 wooden brushs", "I see 3", "I see 3 wooden brushs", "I can count 3 wooden brushs" ], "response_pred": "2 wooden brushes" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many cloths are in the scene?", "response_gt": [ "4", "4 cloths", "I see 4", "I see 4 cloths", "I can count 4 cloths" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "4", "4 bottles", "I see 4", "I see 4 bottles", "I can count 4 bottles" ], "response_pred": "4 bottles" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many spray bottles are in the scene?", "response_gt": [ "2", "2 spray bottles", "I see 2", "I see 2 spray bottles", "I can count 2 spray bottles" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many cardboard boxs are in the scene?", "response_gt": [ "4", "4 cardboard boxs", "I see 4", "I see 4 cardboard boxs", "I can count 4 cardboard boxs" ], "response_pred": "4 cardboard boxes" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many notebooks are in the scene?", "response_gt": [ "1", "1 notebooks", "I see 1", "I see 1 notebooks", "I can count 1 notebooks" ], "response_pred": "3 notebooks" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many rolling stands are in the scene?", "response_gt": [ "1", "1 rolling stands", "I see 1", "I see 1 rolling stands", "I can count 1 rolling stands" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many plastic drums are in the scene?", "response_gt": [ "2", "2 plastic drums", "I see 2", "I see 2 plastic drums", "I can count 2 plastic drums" ], "response_pred": "3 plastic drums" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many workshop tools are in the scene?", "response_gt": [ "2", "2 workshop tools", "I see 2", "I see 2 workshop tools", "I can count 2 workshop tools" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many metal saws are in the scene?", "response_gt": [ "1", "1 metal saws", "I see 1", "I see 1 metal saws", "I can count 1 metal saws" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many hammers are in the scene?", "response_gt": [ "2", "2 hammers", "I see 2", "I see 2 hammers", "I can count 2 hammers" ], "response_pred": "2 hammers" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many cases are in the scene?", "response_gt": [ "1", "1 cases", "I see 1", "I see 1 cases", "I can count 1 cases" ], "response_pred": "3 cases" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many folder holders are in the scene?", "response_gt": [ "1", "1 folder holders", "I see 1", "I see 1 folder holders", "I can count 1 folder holders" ], "response_pred": "2 folder holders" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many hammer holders are in the scene?", "response_gt": [ "1", "1 hammer holders", "I see 1", "I see 1 hammer holders", "I can count 1 hammer holders" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many boards are in the scene?", "response_gt": [ "1", "1 boards", "I see 1", "I see 1 boards", "I can count 1 boards" ], "response_pred": "4 boards" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many paper holders are in the scene?", "response_gt": [ "2", "2 paper holders", "I see 2", "I see 2 paper holders", "I can count 2 paper holders" ], "response_pred": "3 paper holders" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many screwdrivers are in the scene?", "response_gt": [ "2", "2 screwdrivers", "I see 2", "I see 2 screwdrivers", "I can count 2 screwdrivers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "99fa5c25e1", "instruction": " USER: How many garbage bin covers are in the scene?", "response_gt": [ "1", "1 garbage bin covers", "I see 1", "I see 1 garbage bin covers", "I can count 1 garbage bin covers" ], "response_pred": "2 garbage bin covers" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "8", "8 cabinets", "I see 8", "I see 8 cabinets", "I can count 8 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "7", "7 doors", "I see 7", "I see 7 doors", "I can count 7 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "5", "5 windows", "I see 5", "I see 5 windows", "I can count 5 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many bookshelfs are in the scene?", "response_gt": [ "2", "2 bookshelfs", "I see 2", "I see 2 bookshelfs", "I can count 2 bookshelfs" ], "response_pred": "3 bookshelves" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "5", "5 pictures", "I see 5", "I see 5 pictures", "I can count 5 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "2", "2 refrigerators", "I see 2", "I see 2 refrigerators", "I can count 2 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "3", "3 sinks", "I see 3", "I see 3 sinks", "I can count 3 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "1", "1 ceiling lights", "I see 1", "I see 1 ceiling lights", "I can count 1 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "6", "6 switchs", "I see 6", "I see 6 switchs", "I can count 6 switchs" ], "response_pred": "1 switch" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "39", "39 objects", "I see 39", "I see 39 objects", "I can count 39 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many floor lamps are in the scene?", "response_gt": [ "2", "2 floor lamps", "I see 2", "I see 2 floor lamps", "I can count 2 floor lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many carpets are in the scene?", "response_gt": [ "4", "4 carpets", "I see 4", "I see 4 carpets", "I can count 4 carpets" ], "response_pred": "2 carpets" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many coffee tables are in the scene?", "response_gt": [ "1", "1 coffee tables", "I see 1", "I see 1 coffee tables", "I can count 1 coffee tables" ], "response_pred": "3 coffee tables" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "2", "2 pillows", "I see 2", "I see 2 pillows", "I can count 2 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "3", "3 blankets", "I see 3", "I see 3 blankets", "I can count 3 blankets" ], "response_pred": "4 blankets" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many remote controllers are in the scene?", "response_gt": [ "1", "1 remote controllers", "I see 1", "I see 1 remote controllers", "I can count 1 remote controllers" ], "response_pred": "2 remotes" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many coasters are in the scene?", "response_gt": [ "1", "1 coasters", "I see 1", "I see 1 coasters", "I can count 1 coasters" ], "response_pred": "2 coasters" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many books are in the scene?", "response_gt": [ "12", "12 books", "I see 12", "I see 12 books", "I can count 12 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "5", "5 boxs", "I see 5", "I see 5 boxs", "I can count 5 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "2", "2 backpacks", "I see 2", "I see 2 backpacks", "I can count 2 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many speakers are in the scene?", "response_gt": [ "2", "2 speakers", "I see 2", "I see 2 speakers", "I can count 2 speakers" ], "response_pred": "2 speakers" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many sofa chairs are in the scene?", "response_gt": [ "1", "1 sofa chairs", "I see 1", "I see 1 sofa chairs", "I can count 1 sofa chairs" ], "response_pred": "2 sofa chairs" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many footstools are in the scene?", "response_gt": [ "1", "1 footstools", "I see 1", "I see 1 footstools", "I can count 1 footstools" ], "response_pred": "3 footstools" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many hats are in the scene?", "response_gt": [ "1", "1 hats", "I see 1", "I see 1 hats", "I can count 1 hats" ], "response_pred": "2 hats" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "5", "5 plant pots", "I see 5", "I see 5 plant pots", "I can count 5 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "5", "5 plants", "I see 5", "I see 5 plants", "I can count 5 plants" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many sticks are in the scene?", "response_gt": [ "1", "1 sticks", "I see 1", "I see 1 sticks", "I can count 1 sticks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many dining tables are in the scene?", "response_gt": [ "1", "1 dining tables", "I see 1", "I see 1 dining tables", "I can count 1 dining tables" ], "response_pred": "2 dining tables" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many dining chairs are in the scene?", "response_gt": [ "5", "5 dining chairs", "I see 5", "I see 5 dining chairs", "I can count 5 dining chairs" ], "response_pred": "2 dining chairs" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "2", "2 curtains", "I see 2", "I see 2 curtains", "I can count 2 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many shoes are in the scene?", "response_gt": [ "8", "8 shoes", "I see 8", "I see 8 shoes", "I can count 8 shoes" ], "response_pred": "3 shoes" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many suitcases are in the scene?", "response_gt": [ "1", "1 suitcases", "I see 1", "I see 1 suitcases", "I can count 1 suitcases" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "2", "2 bags", "I see 2", "I see 2 bags", "I can count 2 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many flip flops are in the scene?", "response_gt": [ "2", "2 flip flops", "I see 2", "I see 2 flip flops", "I can count 2 flip flops" ], "response_pred": "2 flip flops" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many boots are in the scene?", "response_gt": [ "2", "2 boots", "I see 2", "I see 2 boots", "I can count 2 boots" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many shoe racks are in the scene?", "response_gt": [ "1", "1 shoe racks", "I see 1", "I see 1 shoe racks", "I can count 1 shoe racks" ], "response_pred": "2 shoe racks" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many wall lamps are in the scene?", "response_gt": [ "1", "1 wall lamps", "I see 1", "I see 1 wall lamps", "I can count 1 wall lamps" ], "response_pred": "2 wall lamps" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "2", "2 papers", "I see 2", "I see 2 papers", "I can count 2 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many slippers are in the scene?", "response_gt": [ "3", "3 slippers", "I see 3", "I see 3 slippers", "I can count 3 slippers" ], "response_pred": "2 slippers" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many led ceiling fans are in the scene?", "response_gt": [ "1", "1 led ceiling fans", "I see 1", "I see 1 led ceiling fans", "I can count 1 led ceiling fans" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many bicycles are in the scene?", "response_gt": [ "3", "3 bicycles", "I see 3", "I see 3 bicycles", "I can count 3 bicycles" ], "response_pred": "2 bicycles" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many pumpers are in the scene?", "response_gt": [ "1", "1 pumpers", "I see 1", "I see 1 pumpers", "I can count 1 pumpers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many helmets are in the scene?", "response_gt": [ "2", "2 helmets", "I see 2", "I see 2 helmets", "I can count 2 helmets" ], "response_pred": "2 helmets" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "7", "7 bottles", "I see 7", "I see 7 bottles", "I can count 7 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many surfboards are in the scene?", "response_gt": [ "1", "1 surfboards", "I see 1", "I see 1 surfboards", "I can count 1 surfboards" ], "response_pred": "2 surfboards" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "2", "2 structures", "I see 2", "I see 2 structures", "I can count 2 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many spotlights are in the scene?", "response_gt": [ "2", "2 spotlights", "I see 2", "I see 2 spotlights", "I can count 2 spotlights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many skyligths are in the scene?", "response_gt": [ "1", "1 skyligths", "I see 1", "I see 1 skyligths", "I can count 1 skyligths" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "8", "8 kitchen cabinets", "I see 8", "I see 8 kitchen cabinets", "I can count 8 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many ovens are in the scene?", "response_gt": [ "1", "1 ovens", "I see 1", "I see 1 ovens", "I can count 1 ovens" ], "response_pred": "2 ovens" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "3", "3 towels", "I see 3", "I see 3 towels", "I can count 3 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "3", "3 kitchen counters", "I see 3", "I see 3 kitchen counters", "I can count 3 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many stools are in the scene?", "response_gt": [ "2", "2 stools", "I see 2", "I see 2 stools", "I can count 2 stools" ], "response_pred": "2 stools" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "2", "2 plates", "I see 2", "I see 2 plates", "I can count 2 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many apples are in the scene?", "response_gt": [ "1", "1 apples", "I see 1", "I see 1 apples", "I can count 1 apples" ], "response_pred": "2 apples" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many bananass are in the scene?", "response_gt": [ "1", "1 bananass", "I see 1", "I see 1 bananass", "I can count 1 bananass" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many cardboards are in the scene?", "response_gt": [ "3", "3 cardboards", "I see 3", "I see 3 cardboards", "I can count 3 cardboards" ], "response_pred": "4 cardboards" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many sponges are in the scene?", "response_gt": [ "2", "2 sponges", "I see 2", "I see 2 sponges", "I can count 2 sponges" ], "response_pred": "2 sponges" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many taps are in the scene?", "response_gt": [ "2", "2 taps", "I see 2", "I see 2 taps", "I can count 2 taps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kitchen gloves are in the scene?", "response_gt": [ "1", "1 kitchen gloves", "I see 1", "I see 1 kitchen gloves", "I can count 1 kitchen gloves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kitchen hoods are in the scene?", "response_gt": [ "1", "1 kitchen hoods", "I see 1", "I see 1 kitchen hoods", "I can count 1 kitchen hoods" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many paper towels are in the scene?", "response_gt": [ "2", "2 paper towels", "I see 2", "I see 2 paper towels", "I can count 2 paper towels" ], "response_pred": "4 paper towels" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many cutting boards are in the scene?", "response_gt": [ "1", "1 cutting boards", "I see 1", "I see 1 cutting boards", "I can count 1 cutting boards" ], "response_pred": "2 cutting boards" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kitchen utensilss are in the scene?", "response_gt": [ "1", "1 kitchen utensilss", "I see 1", "I see 1 kitchen utensilss", "I can count 1 kitchen utensilss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many stoves are in the scene?", "response_gt": [ "1", "1 stoves", "I see 1", "I see 1 stoves", "I can count 1 stoves" ], "response_pred": "2 stoves" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many jars are in the scene?", "response_gt": [ "13", "13 jars", "I see 13", "I see 13 jars", "I can count 13 jars" ], "response_pred": "6 jars" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many knife holders are in the scene?", "response_gt": [ "1", "1 knife holders", "I see 1", "I see 1 knife holders", "I can count 1 knife holders" ], "response_pred": "2 knife holders" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many scissors are in the scene?", "response_gt": [ "1", "1 scissors", "I see 1", "I see 1 scissors", "I can count 1 scissors" ], "response_pred": "2 scissors" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many knifes are in the scene?", "response_gt": [ "5", "5 knifes", "I see 5", "I see 5 knifes", "I can count 5 knifes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many machines are in the scene?", "response_gt": [ "1", "1 machines", "I see 1", "I see 1 machines", "I can count 1 machines" ], "response_pred": "2 machines" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many cleaning brushs are in the scene?", "response_gt": [ "1", "1 cleaning brushs", "I see 1", "I see 1 cleaning brushs", "I can count 1 cleaning brushs" ], "response_pred": "2 cleaning brushes" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many toasters are in the scene?", "response_gt": [ "1", "1 toasters", "I see 1", "I see 1 toasters", "I can count 1 toasters" ], "response_pred": "2 toasters" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many trazs are in the scene?", "response_gt": [ "1", "1 trazs", "I see 1", "I see 1 trazs", "I can count 1 trazs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "09c1414f1b", "instruction": " USER: How many splits are in the scene?", "response_gt": [ "1", "1 splits", "I see 1", "I see 1 splits", "I can count 1 splits" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many water kettles are in the scene?", "response_gt": [ "3", "3 water kettles", "I see 3", "I see 3 water kettles", "I can count 3 water kettles" ], "response_pred": "2 water kettles" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "2", "2 trash bins", "I see 2", "I see 2 trash bins", "I can count 2 trash bins" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "14", "14 boxs", "I see 14", "I see 14 boxs", "I can count 14 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "11", "11 tables", "I see 11", "I see 11 tables", "I can count 11 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many printers are in the scene?", "response_gt": [ "2", "2 printers", "I see 2", "I see 2 printers", "I can count 2 printers" ], "response_pred": "3 printers" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "2", "2 monitors", "I see 2", "I see 2 monitors", "I can count 2 monitors" ], "response_pred": "3 monitors" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many copiers are in the scene?", "response_gt": [ "1", "1 copiers", "I see 1", "I see 1 copiers", "I can count 1 copiers" ], "response_pred": "2 copiers" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many suitcases are in the scene?", "response_gt": [ "1", "1 suitcases", "I see 1", "I see 1 suitcases", "I can count 1 suitcases" ], "response_pred": "2 suitcases" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many dish drainers are in the scene?", "response_gt": [ "1", "1 dish drainers", "I see 1", "I see 1 dish drainers", "I can count 1 dish drainers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "14", "14 chairs", "I see 14", "I see 14 chairs", "I can count 14 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many arm chairs are in the scene?", "response_gt": [ "8", "8 arm chairs", "I see 8", "I see 8 arm chairs", "I can count 8 arm chairs" ], "response_pred": "4 arm chairs" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "3", "3 whiteboards", "I see 3", "I see 3 whiteboards", "I can count 3 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many kitchen counters are in the scene?", "response_gt": [ "1", "1 kitchen counters", "I see 1", "I see 1 kitchen counters", "I can count 1 kitchen counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "5", "5 ceiling lamps", "I see 5", "I see 5 ceiling lamps", "I can count 5 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many plant pots are in the scene?", "response_gt": [ "1", "1 plant pots", "I see 1", "I see 1 plant pots", "I can count 1 plant pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many switchs are in the scene?", "response_gt": [ "1", "1 switchs", "I see 1", "I see 1 switchs", "I can count 1 switchs" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many coffee makers are in the scene?", "response_gt": [ "2", "2 coffee makers", "I see 2", "I see 2 coffee makers", "I can count 2 coffee makers" ], "response_pred": "2 coffee makers" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "2", "2 plates", "I see 2", "I see 2 plates", "I can count 2 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "2", "2 kitchen cabinets", "I see 2", "I see 2 kitchen cabinets", "I can count 2 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "1", "1 bags", "I see 1", "I see 1 bags", "I can count 1 bags" ], "response_pred": "3 bags" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "5f99900f09", "instruction": " USER: How many refrigerators are in the scene?", "response_gt": [ "1", "1 refrigerators", "I see 1", "I see 1 refrigerators", "I can count 1 refrigerators" ], "response_pred": "2 refrigerators" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "32", "32 papers", "I see 32", "I see 32 papers", "I can count 32 papers" ], "response_pred": "3 papers" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "19", "19 objects", "I see 19", "I see 19 objects", "I can count 19 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many packages are in the scene?", "response_gt": [ "2", "2 packages", "I see 2", "I see 2 packages", "I can count 2 packages" ], "response_pred": "3 packages" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many posters are in the scene?", "response_gt": [ "2", "2 posters", "I see 2", "I see 2 posters", "I can count 2 posters" ], "response_pred": "4 posters" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "18", "18 mugs", "I see 18", "I see 18 mugs", "I can count 18 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "5", "5 chairs", "I see 5", "I see 5 chairs", "I can count 5 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many ottoman chairs are in the scene?", "response_gt": [ "2", "2 ottoman chairs", "I see 2", "I see 2 ottoman chairs", "I can count 2 ottoman chairs" ], "response_pred": "2 ottoman chairs" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "5", "5 tables", "I see 5", "I see 5 tables", "I can count 5 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "5", "5 sofas", "I see 5", "I see 5 sofas", "I can count 5 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many false ceilings are in the scene?", "response_gt": [ "1", "1 false ceilings", "I see 1", "I see 1 false ceilings", "I can count 1 false ceilings" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many whiteboard erasers are in the scene?", "response_gt": [ "1", "1 whiteboard erasers", "I see 1", "I see 1 whiteboard erasers", "I can count 1 whiteboard erasers" ], "response_pred": "2 whiteboard erasers" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many wall boards are in the scene?", "response_gt": [ "3", "3 wall boards", "I see 3", "I see 3 wall boards", "I can count 3 wall boards" ], "response_pred": "4 wall boards" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many clocks are in the scene?", "response_gt": [ "1", "1 clocks", "I see 1", "I see 1 clocks", "I can count 1 clocks" ], "response_pred": "2 clocks" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many whiteboards are in the scene?", "response_gt": [ "1", "1 whiteboards", "I see 1", "I see 1 whiteboards", "I can count 1 whiteboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many stationerys are in the scene?", "response_gt": [ "1", "1 stationerys", "I see 1", "I see 1 stationerys", "I can count 1 stationerys" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "3", "3 windows", "I see 3", "I see 3 windows", "I can count 3 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many books are in the scene?", "response_gt": [ "21", "21 books", "I see 21", "I see 21 books", "I can count 21 books" ], "response_pred": "3 books" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many chessboards are in the scene?", "response_gt": [ "1", "1 chessboards", "I see 1", "I see 1 chessboards", "I can count 1 chessboards" ], "response_pred": "3 chessboards" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many water kettles are in the scene?", "response_gt": [ "3", "3 water kettles", "I see 3", "I see 3 water kettles", "I can count 3 water kettles" ], "response_pred": "2 water kettles" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many bins are in the scene?", "response_gt": [ "2", "2 bins", "I see 2", "I see 2 bins", "I can count 2 bins" ], "response_pred": "4 bins" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "2", "2 smoke detectors", "I see 2", "I see 2 smoke detectors", "I can count 2 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many ceiling vents are in the scene?", "response_gt": [ "5", "5 ceiling vents", "I see 5", "I see 5 ceiling vents", "I can count 5 ceiling vents" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "7", "7 ceiling lights", "I see 7", "I see 7 ceiling lights", "I can count 7 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many lights are in the scene?", "response_gt": [ "3", "3 lights", "I see 3", "I see 3 lights", "I can count 3 lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "1", "1 structures", "I see 1", "I see 1 structures", "I can count 1 structures" ], "response_pred": "3 structures" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many cushions are in the scene?", "response_gt": [ "6", "6 cushions", "I see 6", "I see 6 cushions", "I can count 6 cushions" ], "response_pred": "4 cushions" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many door stoppers are in the scene?", "response_gt": [ "1", "1 door stoppers", "I see 1", "I see 1 door stoppers", "I can count 1 door stoppers" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many ceiling speakers are in the scene?", "response_gt": [ "1", "1 ceiling speakers", "I see 1", "I see 1 ceiling speakers", "I can count 1 ceiling speakers" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "9071e139d9", "instruction": " USER: How many ventilators are in the scene?", "response_gt": [ "1", "1 ventilators", "I see 1", "I see 1 ventilators", "I can count 1 ventilators" ], "response_pred": "2 ventilators" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "1", "1 pictures", "I see 1", "I see 1 pictures", "I can count 1 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "1", "1 boxs", "I see 1", "I see 1 boxs", "I can count 1 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many bathroom shelfs are in the scene?", "response_gt": [ "1", "1 bathroom shelfs", "I see 1", "I see 1 bathroom shelfs", "I can count 1 bathroom shelfs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many bedside counters are in the scene?", "response_gt": [ "1", "1 bedside counters", "I see 1", "I see 1 bedside counters", "I can count 1 bedside counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many sliding doors are in the scene?", "response_gt": [ "1", "1 sliding doors", "I see 1", "I see 1 sliding doors", "I can count 1 sliding doors" ], "response_pred": "2 sliding doors" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many ottoman chairs are in the scene?", "response_gt": [ "1", "1 ottoman chairs", "I see 1", "I see 1 ottoman chairs", "I can count 1 ottoman chairs" ], "response_pred": "2 ottoman chairs" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "1", "1 chairs", "I see 1", "I see 1 chairs", "I can count 1 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "2", "2 doors", "I see 2", "I see 2 doors", "I can count 2 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "2", "2 telephones", "I see 2", "I see 2 telephones", "I can count 2 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "1", "1 curtains", "I see 1", "I see 1 curtains", "I can count 1 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many toilets are in the scene?", "response_gt": [ "1", "1 toilets", "I see 1", "I see 1 toilets", "I can count 1 toilets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many vents are in the scene?", "response_gt": [ "1", "1 vents", "I see 1", "I see 1 vents", "I can count 1 vents" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many mirrors are in the scene?", "response_gt": [ "1", "1 mirrors", "I see 1", "I see 1 mirrors", "I can count 1 mirrors" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "1", "1 cabinets", "I see 1", "I see 1 cabinets", "I can count 1 cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many wall units are in the scene?", "response_gt": [ "1", "1 wall units", "I see 1", "I see 1 wall units", "I can count 1 wall units" ], "response_pred": "2 wall units" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many bath cabinets are in the scene?", "response_gt": [ "1", "1 bath cabinets", "I see 1", "I see 1 bath cabinets", "I can count 1 bath cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many wardrobes are in the scene?", "response_gt": [ "1", "1 wardrobes", "I see 1", "I see 1 wardrobes", "I can count 1 wardrobes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many table lamps are in the scene?", "response_gt": [ "1", "1 table lamps", "I see 1", "I see 1 table lamps", "I can count 1 table lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many ventilations are in the scene?", "response_gt": [ "1", "1 ventilations", "I see 1", "I see 1 ventilations", "I can count 1 ventilations" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "5", "5 pillows", "I see 5", "I see 5 pillows", "I can count 5 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many headboards are in the scene?", "response_gt": [ "1", "1 headboards", "I see 1", "I see 1 headboards", "I can count 1 headboards" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many bedside lamps are in the scene?", "response_gt": [ "2", "2 bedside lamps", "I see 2", "I see 2 bedside lamps", "I can count 2 bedside lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many sliding door frames are in the scene?", "response_gt": [ "1", "1 sliding door frames", "I see 1", "I see 1 sliding door frames", "I can count 1 sliding door frames" ], "response_pred": "2 sliding door frames" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many trash bins are in the scene?", "response_gt": [ "1", "1 trash bins", "I see 1", "I see 1 trash bins", "I can count 1 trash bins" ], "response_pred": "4 trash bins" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many standing lamps are in the scene?", "response_gt": [ "1", "1 standing lamps", "I see 1", "I see 1 standing lamps", "I can count 1 standing lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many blankets are in the scene?", "response_gt": [ "1", "1 blankets", "I see 1", "I see 1 blankets", "I can count 1 blankets" ], "response_pred": "4 blankets" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "1", "1 towels", "I see 1", "I see 1 towels", "I can count 1 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many shower walls are in the scene?", "response_gt": [ "3", "3 shower walls", "I see 3", "I see 3 shower walls", "I can count 3 shower walls" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many shower seatings are in the scene?", "response_gt": [ "1", "1 shower seatings", "I see 1", "I see 1 shower seatings", "I can count 1 shower seatings" ], "response_pred": "2 shower seatings" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "2", "2 smoke detectors", "I see 2", "I see 2 smoke detectors", "I can count 2 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many lamps are in the scene?", "response_gt": [ "1", "1 lamps", "I see 1", "I see 1 lamps", "I can count 1 lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many shower floors are in the scene?", "response_gt": [ "1", "1 shower floors", "I see 1", "I see 1 shower floors", "I can count 1 shower floors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many toilet papers are in the scene?", "response_gt": [ "2", "2 toilet papers", "I see 2", "I see 2 toilet papers", "I can count 2 toilet papers" ], "response_pred": "2 toilet papers" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many coffee makers are in the scene?", "response_gt": [ "1", "1 coffee makers", "I see 1", "I see 1 coffee makers", "I can count 1 coffee makers" ], "response_pred": "2 coffee makers" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many bedside tables are in the scene?", "response_gt": [ "1", "1 bedside tables", "I see 1", "I see 1 bedside tables", "I can count 1 bedside tables" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "5", "5 ceiling lights", "I see 5", "I see 5 ceiling lights", "I can count 5 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many ceiling vents are in the scene?", "response_gt": [ "1", "1 ceiling vents", "I see 1", "I see 1 ceiling vents", "I can count 1 ceiling vents" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "1", "1 objects", "I see 1", "I see 1 objects", "I can count 1 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "6115eddb86", "instruction": " USER: How many tv remotes are in the scene?", "response_gt": [ "1", "1 tv remotes", "I see 1", "I see 1 tv remotes", "I can count 1 tv remotes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "5", "5 doors", "I see 5", "I see 5 doors", "I can count 5 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cabinets are in the scene?", "response_gt": [ "2", "2 cabinets", "I see 2", "I see 2 cabinets", "I can count 2 cabinets" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many wardrobes are in the scene?", "response_gt": [ "1", "1 wardrobes", "I see 1", "I see 1 wardrobes", "I can count 1 wardrobes" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many kitchen cabinets are in the scene?", "response_gt": [ "2", "2 kitchen cabinets", "I see 2", "I see 2 kitchen cabinets", "I can count 2 kitchen cabinets" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "2", "2 curtains", "I see 2", "I see 2 curtains", "I can count 2 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many sinks are in the scene?", "response_gt": [ "1", "1 sinks", "I see 1", "I see 1 sinks", "I can count 1 sinks" ], "response_pred": "2 sinks" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many counters are in the scene?", "response_gt": [ "1", "1 counters", "I see 1", "I see 1 counters", "I can count 1 counters" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many tvs are in the scene?", "response_gt": [ "1", "1 tvs", "I see 1", "I see 1 tvs", "I can count 1 tvs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many bar stools are in the scene?", "response_gt": [ "2", "2 bar stools", "I see 2", "I see 2 bar stools", "I can count 2 bar stools" ], "response_pred": "2 bar stools" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many chairs are in the scene?", "response_gt": [ "2", "2 chairs", "I see 2", "I see 2 chairs", "I can count 2 chairs" ], "response_pred": "3 chairs" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many fridges are in the scene?", "response_gt": [ "1", "1 fridges", "I see 1", "I see 1 fridges", "I can count 1 fridges" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many clothes racks are in the scene?", "response_gt": [ "1", "1 clothes racks", "I see 1", "I see 1 clothes racks", "I can count 1 clothes racks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many doormats are in the scene?", "response_gt": [ "1", "1 doormats", "I see 1", "I see 1 doormats", "I can count 1 doormats" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many door mats are in the scene?", "response_gt": [ "1", "1 door mats", "I see 1", "I see 1 door mats", "I can count 1 door mats" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "6", "6 pictures", "I see 6", "I see 6 pictures", "I can count 6 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many telephones are in the scene?", "response_gt": [ "1", "1 telephones", "I see 1", "I see 1 telephones", "I can count 1 telephones" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many washing machines are in the scene?", "response_gt": [ "1", "1 washing machines", "I see 1", "I see 1 washing machines", "I can count 1 washing machines" ], "response_pred": "2 washing machines" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many clothes dryers are in the scene?", "response_gt": [ "2", "2 clothes dryers", "I see 2", "I see 2 clothes dryers", "I can count 2 clothes dryers" ], "response_pred": "2 clothes dryers" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many range hoods are in the scene?", "response_gt": [ "1", "1 range hoods", "I see 1", "I see 1 range hoods", "I can count 1 range hoods" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many crates are in the scene?", "response_gt": [ "1", "1 crates", "I see 1", "I see 1 crates", "I can count 1 crates" ], "response_pred": "4" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "3", "3 bags", "I see 3", "I see 3 bags", "I can count 3 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many computer desks are in the scene?", "response_gt": [ "1", "1 computer desks", "I see 1", "I see 1 computer desks", "I can count 1 computer desks" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "1", "1 monitors", "I see 1", "I see 1 monitors", "I can count 1 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many laptops are in the scene?", "response_gt": [ "1", "1 laptops", "I see 1", "I see 1 laptops", "I can count 1 laptops" ], "response_pred": "2 laptops" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many tissue boxs are in the scene?", "response_gt": [ "2", "2 tissue boxs", "I see 2", "I see 2 tissue boxs", "I can count 2 tissue boxs" ], "response_pred": "2 tissue box" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "3", "3 bottles", "I see 3", "I see 3 bottles", "I can count 3 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many mugs are in the scene?", "response_gt": [ "3", "3 mugs", "I see 3", "I see 3 mugs", "I can count 3 mugs" ], "response_pred": "2 mugs" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many smartphones are in the scene?", "response_gt": [ "1", "1 smartphones", "I see 1", "I see 1 smartphones", "I can count 1 smartphones" ], "response_pred": "2 smartphones" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "28", "28 objects", "I see 28", "I see 28 objects", "I can count 28 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many plastic bottles are in the scene?", "response_gt": [ "3", "3 plastic bottles", "I see 3", "I see 3 plastic bottles", "I can count 3 plastic bottles" ], "response_pred": "4 plastic bottles" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many hair dryers are in the scene?", "response_gt": [ "1", "1 hair dryers", "I see 1", "I see 1 hair dryers", "I can count 1 hair dryers" ], "response_pred": "2 hair dryers" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many yoga mats are in the scene?", "response_gt": [ "1", "1 yoga mats", "I see 1", "I see 1 yoga mats", "I can count 1 yoga mats" ], "response_pred": "2 yoga mats" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cardboards are in the scene?", "response_gt": [ "4", "4 cardboards", "I see 4", "I see 4 cardboards", "I can count 4 cardboards" ], "response_pred": "4 cardboards" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "1", "1 towels", "I see 1", "I see 1 towels", "I can count 1 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many shoes are in the scene?", "response_gt": [ "4", "4 shoes", "I see 4", "I see 4 shoes", "I can count 4 shoes" ], "response_pred": "3 shoes" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many detergent bottles are in the scene?", "response_gt": [ "2", "2 detergent bottles", "I see 2", "I see 2 detergent bottles", "I can count 2 detergent bottles" ], "response_pred": "2 detergent bottles" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many suitcases are in the scene?", "response_gt": [ "3", "3 suitcases", "I see 3", "I see 3 suitcases", "I can count 3 suitcases" ], "response_pred": "2 suitcases" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many backpacks are in the scene?", "response_gt": [ "4", "4 backpacks", "I see 4", "I see 4 backpacks", "I can count 4 backpacks" ], "response_pred": "2 backpacks" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "1", "1 shelfs", "I see 1", "I see 1 shelfs", "I can count 1 shelfs" ], "response_pred": "There are 5 shelves in the scene." }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many paper towels are in the scene?", "response_gt": [ "1", "1 paper towels", "I see 1", "I see 1 paper towels", "I can count 1 paper towels" ], "response_pred": "4 paper towels" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many paper towel packages are in the scene?", "response_gt": [ "1", "1 paper towel packages", "I see 1", "I see 1 paper towel packages", "I can count 1 paper towel packages" ], "response_pred": "2 paper towel packages" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "2", "2 trash cans", "I see 2", "I see 2 trash cans", "I can count 2 trash cans" ], "response_pred": "4 trash cans" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many laundry baskets are in the scene?", "response_gt": [ "1", "1 laundry baskets", "I see 1", "I see 1 laundry baskets", "I can count 1 laundry baskets" ], "response_pred": "4 laundry baskets" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many fire extinguishers are in the scene?", "response_gt": [ "1", "1 fire extinguishers", "I see 1", "I see 1 fire extinguishers", "I can count 1 fire extinguishers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many delivery bags are in the scene?", "response_gt": [ "1", "1 delivery bags", "I see 1", "I see 1 delivery bags", "I can count 1 delivery bags" ], "response_pred": "2 delivery bags" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many tote bags are in the scene?", "response_gt": [ "1", "1 tote bags", "I see 1", "I see 1 tote bags", "I can count 1 tote bags" ], "response_pred": "2 tote bags" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many ovens are in the scene?", "response_gt": [ "1", "1 ovens", "I see 1", "I see 1 ovens", "I can count 1 ovens" ], "response_pred": "2 ovens" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many microwaves are in the scene?", "response_gt": [ "1", "1 microwaves", "I see 1", "I see 1 microwaves", "I can count 1 microwaves" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many wok pans are in the scene?", "response_gt": [ "1", "1 wok pans", "I see 1", "I see 1 wok pans", "I can count 1 wok pans" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many dishwashing sponges are in the scene?", "response_gt": [ "1", "1 dishwashing sponges", "I see 1", "I see 1 dishwashing sponges", "I can count 1 dishwashing sponges" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many kitchen brushs are in the scene?", "response_gt": [ "1", "1 kitchen brushs", "I see 1", "I see 1 kitchen brushs", "I can count 1 kitchen brushs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many pans are in the scene?", "response_gt": [ "1", "1 pans", "I see 1", "I see 1 pans", "I can count 1 pans" ], "response_pred": "4 pans" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cardboard boxs are in the scene?", "response_gt": [ "1", "1 cardboard boxs", "I see 1", "I see 1 cardboard boxs", "I can count 1 cardboard boxs" ], "response_pred": "2 cardboard boxes" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cloths are in the scene?", "response_gt": [ "3", "3 cloths", "I see 3", "I see 3 cloths", "I can count 3 cloths" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many coats are in the scene?", "response_gt": [ "4", "4 coats", "I see 4", "I see 4 coats", "I can count 4 coats" ], "response_pred": "3 coats" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cloth hangers are in the scene?", "response_gt": [ "1", "1 cloth hangers", "I see 1", "I see 1 cloth hangers", "I can count 1 cloth hangers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many sofas are in the scene?", "response_gt": [ "1", "1 sofas", "I see 1", "I see 1 sofas", "I can count 1 sofas" ], "response_pred": "2 sofas" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many trouserss are in the scene?", "response_gt": [ "1", "1 trouserss", "I see 1", "I see 1 trouserss", "I can count 1 trouserss" ], "response_pred": "2 trouserss" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many clothess are in the scene?", "response_gt": [ "1", "1 clothess", "I see 1", "I see 1 clothess", "I can count 1 clothess" ], "response_pred": "2 clothess" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many light fixtures are in the scene?", "response_gt": [ "3", "3 light fixtures", "I see 3", "I see 3 light fixtures", "I can count 3 light fixtures" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many faucets are in the scene?", "response_gt": [ "1", "1 faucets", "I see 1", "I see 1 faucets", "I can count 1 faucets" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many plates are in the scene?", "response_gt": [ "2", "2 plates", "I see 2", "I see 2 plates", "I can count 2 plates" ], "response_pred": "2 plates" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many trays are in the scene?", "response_gt": [ "1", "1 trays", "I see 1", "I see 1 trays", "I can count 1 trays" ], "response_pred": "3 trays" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cereal boxs are in the scene?", "response_gt": [ "1", "1 cereal boxs", "I see 1", "I see 1 cereal boxs", "I can count 1 cereal boxs" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many flour bags are in the scene?", "response_gt": [ "1", "1 flour bags", "I see 1", "I see 1 flour bags", "I can count 1 flour bags" ], "response_pred": "2 flour bags" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many moka pots are in the scene?", "response_gt": [ "1", "1 moka pots", "I see 1", "I see 1 moka pots", "I can count 1 moka pots" ], "response_pred": "2 moka pots" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cooking pots are in the scene?", "response_gt": [ "2", "2 cooking pots", "I see 2", "I see 2 cooking pots", "I can count 2 cooking pots" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many power strips are in the scene?", "response_gt": [ "1", "1 power strips", "I see 1", "I see 1 power strips", "I can count 1 power strips" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many cutting boards are in the scene?", "response_gt": [ "1", "1 cutting boards", "I see 1", "I see 1 cutting boards", "I can count 1 cutting boards" ], "response_pred": "2 cutting boards" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many dish drying racks are in the scene?", "response_gt": [ "1", "1 dish drying racks", "I see 1", "I see 1 dish drying racks", "I can count 1 dish drying racks" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many pot covers are in the scene?", "response_gt": [ "1", "1 pot covers", "I see 1", "I see 1 pot covers", "I can count 1 pot covers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many kitchen utensils are in the scene?", "response_gt": [ "1", "1 kitchen utensils", "I see 1", "I see 1 kitchen utensils", "I can count 1 kitchen utensils" ], "response_pred": "2 kitchen utensils" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many kettles are in the scene?", "response_gt": [ "1", "1 kettles", "I see 1", "I see 1 kettles", "I can count 1 kettles" ], "response_pred": "2 kettles" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many stools are in the scene?", "response_gt": [ "1", "1 stools", "I see 1", "I see 1 stools", "I can count 1 stools" ], "response_pred": "2 stools" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many water pitchers are in the scene?", "response_gt": [ "1", "1 water pitchers", "I see 1", "I see 1 water pitchers", "I can count 1 water pitchers" ], "response_pred": "2 water pitchers" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many soap dispensers are in the scene?", "response_gt": [ "1", "1 soap dispensers", "I see 1", "I see 1 soap dispensers", "I can count 1 soap dispensers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many electric stoves are in the scene?", "response_gt": [ "1", "1 electric stoves", "I see 1", "I see 1 electric stoves", "I can count 1 electric stoves" ], "response_pred": "2 electric stoves" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "4", "4 boxs", "I see 4", "I see 4 boxs", "I can count 4 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many baskets are in the scene?", "response_gt": [ "1", "1 baskets", "I see 1", "I see 1 baskets", "I can count 1 baskets" ], "response_pred": "4 baskets" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many electric pots are in the scene?", "response_gt": [ "1", "1 electric pots", "I see 1", "I see 1 electric pots", "I can count 1 electric pots" ], "response_pred": "2 electric pots" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many egg cartons are in the scene?", "response_gt": [ "1", "1 egg cartons", "I see 1", "I see 1 egg cartons", "I can count 1 egg cartons" ], "response_pred": "4 egg cartons" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many kitchen objects are in the scene?", "response_gt": [ "1", "1 kitchen objects", "I see 1", "I see 1 kitchen objects", "I can count 1 kitchen objects" ], "response_pred": "2 kitchen cabinets, 2 dishwashers, 2 refrigerators, 2 toaster ovens, 2 microwave ovens, 2 paper towel dispensers, and 1 sink" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many paperss are in the scene?", "response_gt": [ "1", "1 paperss", "I see 1", "I see 1 paperss", "I can count 1 paperss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many jars are in the scene?", "response_gt": [ "2", "2 jars", "I see 2", "I see 2 jars", "I can count 2 jars" ], "response_pred": "6 jars" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many duvets are in the scene?", "response_gt": [ "1", "1 duvets", "I see 1", "I see 1 duvets", "I can count 1 duvets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many ceiling lights are in the scene?", "response_gt": [ "1", "1 ceiling lights", "I see 1", "I see 1 ceiling lights", "I can count 1 ceiling lights" ], "response_pred": "1" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many electric ducts are in the scene?", "response_gt": [ "1", "1 electric ducts", "I see 1", "I see 1 electric ducts", "I can count 1 electric ducts" ], "response_pred": "2 electric ducts" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "27dd4da69e", "instruction": " USER: How many robot vaccuum cleaners are in the scene?", "response_gt": [ "1", "1 robot vaccuum cleaners", "I see 1", "I see 1 robot vaccuum cleaners", "I can count 1 robot vaccuum cleaners" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many monitors are in the scene?", "response_gt": [ "2", "2 monitors", "I see 2", "I see 2 monitors", "I can count 2 monitors" ], "response_pred": "2 monitors" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many pictures are in the scene?", "response_gt": [ "6", "6 pictures", "I see 6", "I see 6 pictures", "I can count 6 pictures" ], "response_pred": "2 pictures" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many closets are in the scene?", "response_gt": [ "1", "1 closets", "I see 1", "I see 1 closets", "I can count 1 closets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many doors are in the scene?", "response_gt": [ "1", "1 doors", "I see 1", "I see 1 doors", "I can count 1 doors" ], "response_pred": "2 doors" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many beds are in the scene?", "response_gt": [ "1", "1 beds", "I see 1", "I see 1 beds", "I can count 1 beds" ], "response_pred": "2 beds" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many office chairs are in the scene?", "response_gt": [ "1", "1 office chairs", "I see 1", "I see 1 office chairs", "I can count 1 office chairs" ], "response_pred": "4 office chairs" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many bags are in the scene?", "response_gt": [ "3", "3 bags", "I see 3", "I see 3 bags", "I can count 3 bags" ], "response_pred": "2 bags" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many objects are in the scene?", "response_gt": [ "32", "32 objects", "I see 32", "I see 32 objects", "I can count 32 objects" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many nightstands are in the scene?", "response_gt": [ "1", "1 nightstands", "I see 1", "I see 1 nightstands", "I can count 1 nightstands" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many books are in the scene?", "response_gt": [ "1", "1 books", "I see 1", "I see 1 books", "I can count 1 books" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many boxs are in the scene?", "response_gt": [ "2", "2 boxs", "I see 2", "I see 2 boxs", "I can count 2 boxs" ], "response_pred": "2 boxes" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many shelfs are in the scene?", "response_gt": [ "2", "2 shelfs", "I see 2", "I see 2 shelfs", "I can count 2 shelfs" ], "response_pred": "3" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many plants are in the scene?", "response_gt": [ "2", "2 plants", "I see 2", "I see 2 plants", "I can count 2 plants" ], "response_pred": "three" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many lamps are in the scene?", "response_gt": [ "1", "1 lamps", "I see 1", "I see 1 lamps", "I can count 1 lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many papers are in the scene?", "response_gt": [ "2", "2 papers", "I see 2", "I see 2 papers", "I can count 2 papers" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many plierss are in the scene?", "response_gt": [ "1", "1 plierss", "I see 1", "I see 1 plierss", "I can count 1 plierss" ], "response_pred": "2 plierss" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many caps are in the scene?", "response_gt": [ "2", "2 caps", "I see 2", "I see 2 caps", "I can count 2 caps" ], "response_pred": "3 caps" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many headphoness are in the scene?", "response_gt": [ "1", "1 headphoness", "I see 1", "I see 1 headphoness", "I can count 1 headphoness" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many headphone holders are in the scene?", "response_gt": [ "1", "1 headphone holders", "I see 1", "I see 1 headphone holders", "I can count 1 headphone holders" ], "response_pred": "2 headphone holders" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many tables are in the scene?", "response_gt": [ "1", "1 tables", "I see 1", "I see 1 tables", "I can count 1 tables" ], "response_pred": "2 tables" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many pcs are in the scene?", "response_gt": [ "1", "1 pcs", "I see 1", "I see 1 pcs", "I can count 1 pcs" ], "response_pred": "2 pcs" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many cups are in the scene?", "response_gt": [ "1", "1 cups", "I see 1", "I see 1 cups", "I can count 1 cups" ], "response_pred": "2 cups" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many post-its are in the scene?", "response_gt": [ "1", "1 post-its", "I see 1", "I see 1 post-its", "I can count 1 post-its" ], "response_pred": "2 post-its" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many notebooks are in the scene?", "response_gt": [ "1", "1 notebooks", "I see 1", "I see 1 notebooks", "I can count 1 notebooks" ], "response_pred": "3 notebooks" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many mouse pads are in the scene?", "response_gt": [ "1", "1 mouse pads", "I see 1", "I see 1 mouse pads", "I can count 1 mouse pads" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many mouses are in the scene?", "response_gt": [ "1", "1 mouses", "I see 1", "I see 1 mouses", "I can count 1 mouses" ], "response_pred": "2 mouses" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many keyboards are in the scene?", "response_gt": [ "1", "1 keyboards", "I see 1", "I see 1 keyboards", "I can count 1 keyboards" ], "response_pred": "2 keyboards" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many trash cans are in the scene?", "response_gt": [ "1", "1 trash cans", "I see 1", "I see 1 trash cans", "I can count 1 trash cans" ], "response_pred": "3 trash cans" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many blindss are in the scene?", "response_gt": [ "1", "1 blindss", "I see 1", "I see 1 blindss", "I can count 1 blindss" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many curtains are in the scene?", "response_gt": [ "1", "1 curtains", "I see 1", "I see 1 curtains", "I can count 1 curtains" ], "response_pred": "2 curtains" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many windows are in the scene?", "response_gt": [ "1", "1 windows", "I see 1", "I see 1 windows", "I can count 1 windows" ], "response_pred": "2 windows" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many ceiling lamps are in the scene?", "response_gt": [ "1", "1 ceiling lamps", "I see 1", "I see 1 ceiling lamps", "I can count 1 ceiling lamps" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many towels are in the scene?", "response_gt": [ "3", "3 towels", "I see 3", "I see 3 towels", "I can count 3 towels" ], "response_pred": "2 towels" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many shoes holders are in the scene?", "response_gt": [ "1", "1 shoes holders", "I see 1", "I see 1 shoes holders", "I can count 1 shoes holders" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many shoess are in the scene?", "response_gt": [ "2", "2 shoess", "I see 2", "I see 2 shoess", "I can count 2 shoess" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many heaters are in the scene?", "response_gt": [ "1", "1 heaters", "I see 1", "I see 1 heaters", "I can count 1 heaters" ], "response_pred": "2 heaters" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many scarfs are in the scene?", "response_gt": [ "1", "1 scarfs", "I see 1", "I see 1 scarfs", "I can count 1 scarfs" ], "response_pred": "2 scarfs" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many bedside cabinets are in the scene?", "response_gt": [ "1", "1 bedside cabinets", "I see 1", "I see 1 bedside cabinets", "I can count 1 bedside cabinets" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many bookss are in the scene?", "response_gt": [ "1", "1 bookss", "I see 1", "I see 1 bookss", "I can count 1 bookss" ], "response_pred": "2 books" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many bottles are in the scene?", "response_gt": [ "1", "1 bottles", "I see 1", "I see 1 bottles", "I can count 1 bottles" ], "response_pred": "6 bottles" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many postcardss are in the scene?", "response_gt": [ "1", "1 postcardss", "I see 1", "I see 1 postcardss", "I can count 1 postcardss" ], "response_pred": "2 postcards" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many structures are in the scene?", "response_gt": [ "1", "1 structures", "I see 1", "I see 1 structures", "I can count 1 structures" ], "response_pred": "2 structures" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many smoke detectors are in the scene?", "response_gt": [ "1", "1 smoke detectors", "I see 1", "I see 1 smoke detectors", "I can count 1 smoke detectors" ], "response_pred": "2" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many bedframes are in the scene?", "response_gt": [ "3", "3 bedframes", "I see 3", "I see 3 bedframes", "I can count 3 bedframes" ], "response_pred": "two" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many jackets are in the scene?", "response_gt": [ "1", "1 jackets", "I see 1", "I see 1 jackets", "I can count 1 jackets" ], "response_pred": "3 jackets" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many pillows are in the scene?", "response_gt": [ "2", "2 pillows", "I see 2", "I see 2 pillows", "I can count 2 pillows" ], "response_pred": "4 pillows" }, { "source": "scannet", "scene_id": "c49a8c6cff", "instruction": " USER: How many comforters are in the scene?", "response_gt": [ "1", "1 comforters", "I see 1", "I see 1 comforters", "I can count 1 comforters" ], "response_pred": "2" } ]