python异步切片下载二进制文件(修复了不能加headers的问题)

y小白 2022-03-10
777
异步切片下载二进制文件
异步切片，断点续传，提高下载速度
# -*- coding: UTF-8 -*-
# @author: ylw
# @file: async_dwnload
# @time: 2022-3-8
# @desc:
import os
import time
import asyncio
import aiohttp
from retrying import retry


class Async_download:
    def __init__(self, sema_number=20):
        self.__sema_number = sema_number
        self.__sema = None

    @classmethod
    def _mkdir(cls, file_path, full_path):
        if os.path.exists(file_path):
            return True
        try:
            os.makedirs(full_path)
        except Exception as e:
            pass
        return False

    def __init_check(self, file_path: str):
        full_path, file_name = file_path.rsplit('/', 1)
        file_size = os.path.getsize(file_path) if self._mkdir(file_path, full_path) else 0
        return file_name, file_size

    @classmethod
    def __sync_save_local(cls, r_headers, results, file_path):
        done, padding = results
        for d in done:
            for index, value in d.result().items():
                r_headers[index] = value

        with open(file_path, 'ab') as f:
            for _, value in r_headers.items():
                f.write(value)
        return True

    @classmethod
    def __generate_headers(cls, headers, file_size, first_byte):
        r_headers = {}
        index = 0
        if first_byte > 51200000:
            byte = 2048000  # 2M 为一片
        else:
            byte = 1024000  # 1M 为一片
        while True:
            file_size_two = file_size + byte
            if file_size_two >= first_byte:
                r_headers[index] = {"Range": f"bytes={file_size}-{first_byte}"}
                break
            r_headers[index] = {"Range": f"bytes={file_size}-{file_size_two - 1}"}
            index += 1
            file_size = file_size_two

        for key in r_headers:
            r_headers[key].update(headers)
        return r_headers

    @retry(stop_max_attempt_number=3)
    async def __download_one(self, session, method, url, r_headers, **kwargs):
        index, headers = r_headers
        async with self.__sema:
            async with session.request(method, url, headers=headers, **kwargs) as response:
                binary = await response.content.read()
                return {index: binary}

    async def __async_section_download(self, session, method, url, r_headers, **kwargs):
        tasks = [
            asyncio.create_task(self.__download_one(session, method, url, (key, r_headers[key]), **kwargs)) for key in
            r_headers
        ]
        return await asyncio.wait(tasks)

    @classmethod
    async def __get_content_length(cls, session, method, url, headers, **kwargs):
        async with session.request(method, url, headers=headers, **kwargs) as response:
            return response.headers.get('Content-Length') or response.headers.get('content-length') or 0

    @classmethod
    async def __sync_download(cls, session, method, url, headers, file_path, **kwargs):
        async with session.request(method, url, headers=headers, **kwargs) as response:
            with open(file_path, 'wb') as f:
                binary = await response.content.read()
                f.write(binary)

    async def __async_download_main(self, method, url, headers, file_path, **kwargs):
        """
        :param url:
        :param file_path: fullpath/file_name
        :return:
        """
        file_name, file_size = self.__init_check(file_path)
        self.__sema = asyncio.Semaphore(self.__sema_number)
        async with aiohttp.ClientSession() as session:
            content_length = await self.__get_content_length(session, method, url, headers, **kwargs)

            if content_length and content_length.isdigit():
                content_length = int(content_length)
                if file_size >= content_length:
                    return True, file_path

                r_headers = self.__generate_headers(headers, file_size, content_length)
                results = await self.__async_section_download(session, method, url, r_headers, **kwargs)
                self.__sync_save_local(r_headers, results, file_path)
            else:
                await self.__sync_download(session, method, url, headers, file_path, **kwargs)

            if os.path.getsize(file_path) >= int(content_length):
                return True, file_path
            return False, file_path

    def start(self, method, url, headers, file_path, **kwargs):
        # **kwargs 参数跟 aiohttp.request() 参数一致
        return asyncio.run(self.__async_download_main(method, url, headers, file_path, **kwargs))


if __name__ == '__main__':
    as_dw = Async_download(20)
    url_ = "https://vd4.bdstatic.com/mda-nc6yygrbaqzsrd5u/hd/cae_h264_delogo/1646695221404923243/mda-nc6yygrbaqzsrd5u.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1646795766-0-0-52830eb9a3936b06fc194613d67a7fa5&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=2766416251&vid=17956560932232323908&abtest=100815_2-17451_1&klogid=2766416251"
    s = time.time()
    print(as_dw.start(
            'get',
            url_,
            {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'},
            './text.mp4',
            cookies={}
        ))
    print(time.time() - s)
速度对比
应该是我这边网速不好，所以差距会如此之大
ps：大佬们点个赞吧
python
文章转载自y小白，如果涉嫌侵权，请发送邮件至：contact@modb.pro进行举报，并提供相关证据，一经查实，墨天轮将立刻删除相关内容。
python异步切片下载二进制文件(修复了不能加headers的问题)

评论