https://tianchi.aliyun.com/s/7e720100fbd31e382869f7fd4a41908c
赛题名称:2023全球智能汽车AI挑战赛——赛道二:智能驾驶汽车虚拟仿真视频数据理解 赛题类型:视频理解与视频问答
赛题背景
当前,全球新一轮科技革命和产业变革蓬勃发展,汽车与人工智能技术加速融合,电动化、网联化、智能化成为汽车产业的发展潮流和趋势,AI技术将更广泛地与汽车产业的各个领域相融合,应用于汽车的智能维护、智能制造、智能驾驶等诸多方面。
作为人工智能技术和汽车产业先进技术的倡导者,吉利汽车集团、阿里云、NVIDIA 英伟达一直致力于推动未来出行方式的发展,共同发起了本届2023全球智能汽车AI挑战赛。本届比赛将汇聚来自全球各地的杰出AI领域人才,推动自动驾驶、AI大模型、加速计算、云计算等技术深度结合,为未来智能出行提供更加安全、高效、舒适的解决方案。
赛题任务
输入:元宇宙仿真平台生成的前视摄像头虚拟视频数据(8-10秒左右); 输出:对视频中的信息进行综合理解,以指定的json文件格式,按照数据说明中的关键词(key)填充描述型的文本信息(value,中文/英文均可以);
评价方法
选手报名成功后,参赛队伍通过天池平台下载视频数据样例,在本地调试算法,并在线提交结果。
在初赛期间,主办方发布部分参考视频(选手需要自行准备用于模型训练所需要的数据)和测试数据集供选手评估模型效果。
参赛选手可以手动提交评估结果文件,系统会实时评估并返回成绩。每个队伍每天的提交次数限制为3次。
赛题baseline
为了帮助大家快速跑通全流程,我们基于百度AI Studio将本教程的Baseline部署在线上平台,可一键fork运行代码并查看成绩。
一键运行:https://aistudio.baidu.com/projectdetail/7033846
# 导入环境
import paddle
from PIL import Image
from clip import tokenize, load_model
import glob, json, os
import cv2
from PIL import Image
from tqdm import tqdm_notebook
import numpy as np
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
# Load the CLIP model. `load_model` is asked for the 'ViT_B_32' variant with
# pretrained weights and returns two objects: the model itself and
# `transforms`, which is later applied to each frame before it is fed to the
# model (presumably resize/normalize preprocessing -- confirm in the clip package).
model, transforms = load_model('ViT_B_32', pretrained=True)
# Candidate English text labels for every answer field, keyed by field name.
# For each field that is predicted with CLIP, the frame is scored against
# every label in its list and the best-scoring label becomes the answer.
#
# NOTE(review): the key "scerario" is misspelled, but the per-video result
# record uses the exact same spelling, so it is presumably the key required by
# the submission format -- do not "fix" it without confirming.
#
# Labels within a list must be unique: a repeated label would split its
# softmax probability between the copies and bias the prediction away from it.
# ("speed bump" used to be listed twice under "general_obstacle".)
en_match_words = {
    "scerario": ["cityroad", "urban area", "suburb", "tunnel", "parking lot", "gas station/charging station"],
    "weather": ["sunny", "rainy", "cloudy", "foggy", "snowy"],
    "period": ["daytime", "night", "dawn/dusk"],
    "road_structure": ["intersection", "T-junction", "on-ramp/off-ramp", "merge lane", "enter/exit parking lot", "roundabout", "regular lane", "unknown"],
    "general_obstacle": ["speed bump", "water horse", "gravel/stones", "manhole cover", "none"],
    "abnormal_condition": ["oil stains/water stains", "water accumulation", "cracks", "uneven surface", "none", "unknown"],
    "ego_car_behavior": ["straight", "turning left", "turning right", "stop", "U-turn", "accelerate", "decelerate", "change lanes", "other"],
    "closest_participants_type": ["pedestrian", "small car", "truck", "traffic police", "none", "unknown", "other"],
    "closest_participants_behavior": ["go straight", "turn left", "turn right", "stop", "make a U-turn", "accelerate", "decelerate", "change lanes", "other"],
}
# Skeleton of the submission file: fixed metadata plus a list that receives
# one result record per processed video.
submit_json = dict(
    author="abc",
    time="231011",
    model="model_name",
    test_results=[],  # filled in while iterating over the videos
)
# Collect the test videos in a stable (sorted) order.
paths = sorted(glob.glob('./初赛测试视频/*'))

# Only these fields are predicted with CLIP; every other field keeps the
# hard-coded default from the per-video record below.
predicted_keywords = [
    keyword for keyword in en_match_words
    if keyword in ("weather", "road_structure")
]

# The candidate labels are identical for every video, so tokenize them once
# instead of once per video.
label_tokens = {
    keyword: tokenize(en_match_words[keyword])
    for keyword in predicted_keywords
}

# Process every video.
for video_path in paths:
    print(video_path)

    # The file name identifies the clip. os.path.basename handles whichever
    # path separator glob produced on this platform.
    clip_id = os.path.basename(video_path)

    # Default answers; the predicted fields are overwritten further down.
    single_video_result = {
        "clip_id": clip_id,
        "scerario": "cityroad",
        "weather": "unknown",
        "period": "night",
        "road_structure": "ramp",
        "general_obstacle": "nothing",
        "abnormal_condition": "nothing",
        "ego_car_behavior": "turning right",
        "closest_participants_type": "passenger car",
        "closest_participants_behavior": "braking",
    }

    # Read only the first frame of the video, and always release the capture
    # handle so that file descriptors do not pile up over many videos.
    cap = cv2.VideoCapture(video_path)
    try:
        ok, img = cap.read()
    finally:
        cap.release()

    if not ok or img is None:
        # Unreadable or empty file: keep the default answers so the submission
        # still contains one record per clip, instead of crashing the run.
        print("warning: could not read a frame from", video_path)
        submit_json["test_results"].append(single_video_result)
        continue

    # OpenCV decodes to BGR; convert to RGB before building the PIL image,
    # then apply the model's preprocessing and add a batch dimension.
    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    image = transforms(image).unsqueeze(0)

    # Match the frame against the candidate labels of each predicted field.
    for keyword in predicted_keywords:
        labels = en_match_words[keyword]

        with paddle.no_grad():
            logits_per_image, _ = model(image, label_tokens[keyword])
            probs = paddle.nn.functional.softmax(logits_per_image, axis=-1)

        # Pick the label with the highest probability for the single image.
        best_index = int(probs.numpy()[0].argmax())
        single_video_result[keyword] = labels[best_index]

    submit_json["test_results"].append(single_video_result)

# Write the submission file; ensure_ascii=False keeps non-ASCII text readable.
with open('clip_result.json', 'w', encoding='utf-8') as up:
    json.dump(submit_json, up, ensure_ascii=False)
# 竞赛交流群 邀请函 #

每天大模型、算法竞赛、干货资讯

文章转载自Coggle数据科学,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。




