Python迁移图床到阿里OSS

由于weibo对外链的封锁越来越严重，很多图片已经无法直接访问，因此最近将所有的图片迁移到了阿里oss。虽然很多weibo图片无法直接访问，好在将请求的referer设置为weibo.com之后都能正常下载，所以图片至少没有丢失。

新建阿里oss bucket

首先得利用阿里oss对象存储来存放图片，教程可以参考下面的博文：

Python脚本一次性迁移图片

由于图片数量较多，涉及到很多文章，所以采用脚本一次性扫描所有文章，先下载图片，再上传到oss，最后用新的url替换所有旧的weibo图片地址。

代码如下，主要流程就是扫描md文件，解析图片url，下载图片，上传图片到oss，最后根据mapping表来替换所有图片地址。

import re
import requests
import os
import oss2
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import uuid

def generate_file_name():
    """Return a fresh local file name for a downloaded image.

    The name has the shape ``<YYYYmmddHHMMSS>_<5 chars>.jpg``: the current
    local time followed by the first five characters of a random UUID4.
    The extension is always ``.jpg``, whatever the source image type is.
    """
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    # The short random part keeps names distinct for images saved within
    # the same second.
    random_part = str(uuid.uuid4())[:5]
    return "{}_{}.jpg".format(timestamp, random_part)

# Collect the image links of a markdown file, save each image locally and upload it to OSS
def get_pic_url(filename):
    """Download every image referenced by a markdown file and re-upload it.

    Both markdown image syntax (``![alt](url)``) and HTML ``<img src="url">``
    tags are recognised. Each image is fetched with a weibo.com referer,
    written into the module-level ``pic_path`` directory and passed to
    ``upload_pic_to_ali_oss``.

    Args:
        filename: path of the markdown file to scan.

    Returns:
        dict mapping each original image URL to its new URL. URLs whose
        download or upload failed are left out; they are printed and
        appended to ``error_images.text`` instead.
    """
    url_map = {}
    # Read the whole file first so it is not kept open during network I/O.
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()

    img_patten = r'!\[.*?\]\((.*?)\)|<img.*?src=[\'\"](.*?)[\'\"].*?>'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,'
                      ' like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'referer': 'https://weibo.com/'
    }

    for md_url, html_url in re.findall(img_patten, content):
        # findall yields one (markdown, html) tuple per match; only the
        # alternative that actually matched is non-empty.
        url = md_url or html_url
        # Nothing to fetch, or this URL was already migrated in this file.
        if not url or url in url_map:
            continue
        # skip the image that already replaced
        if "your_pic_domain" in url:
            continue

        print("图片原url:", url)

        try:
            # The timeout keeps one stalled download from blocking a worker
            # forever; raise_for_status stops an HTTP error page from being
            # saved and uploaded as if it were the image.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            pic_name = "{}/{}".format(pic_path, generate_file_name())
            print("saving image " + url + " to: " + pic_name)

            with open(pic_name, 'wb') as f2:
                f2.write(response.content)

            new_pic_url = upload_pic_to_ali_oss(pic_name)
            if not new_pic_url:
                raise RuntimeError("upload image failed.")

            print("图片新url:", new_pic_url)
            url_map[url] = new_pic_url
        except Exception as e:
            # Best effort: record the failure and carry on with the next image.
            print("文件:", filename, "url处理失败:", url, e)
            with open("error_images.text", 'a', encoding='utf-8') as ef:
                ef.write("文件: " + filename + ", url处理失败: " + url + "\n")

    print(url_map)
    return url_map


# Collect all markdown files
def list_file(files, path):
    """Recursively append every ``.md`` file found under ``path`` to ``files``.

    Args:
        files: list that receives the matching paths; it is extended in place.
        path: directory to scan, including all of its sub-directories.

    Returns:
        The same ``files`` list that was passed in.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isdir(full_path):
            # Plain entry: keep it only when it is a markdown file.
            if full_path.endswith(".md"):
                files.append(full_path)
            continue
        # Directory: descend and keep collecting into the same list.
        list_file(files, full_path)
    return files


def upload_pic_to_ali_oss(file_path):
    """Upload a local image file to the OSS bucket and return its URL.

    NOTE(review): 'token', 'your_bucket_name', 'bucket_path' and
    'https://your_domain' look like placeholders to be replaced with real
    values before running - confirm. The strings are concatenated without
    any separator, so the real values have to carry their own slashes.

    Args:
        file_path: full local path of the file to upload.

    Returns:
        The new URL string on success, or ``None`` when the upload raised.
    """
    credentials = oss2.Auth('token', 'token')
    target_bucket = oss2.Bucket(credentials, 'http://oss-cn-shanghai.aliyuncs.com', 'your_bucket_name')
    try:
        print("uploading " + file_path)
        # The object key excludes the bucket name; the local side is the
        # full file path.
        object_key = 'bucket_path' + os.path.basename(file_path)
        target_bucket.put_object_from_file(object_key, file_path)
    except Exception as err:
        print('upload image to ali oss failed:', err)
        return None
    else:
        return 'https://your_domain' + object_key

# Replace the old image links in a markdown file
def modify_md(filename, url_map):
    """Rewrite ``filename`` so every old image URL points at its new URL.

    All replacements are applied in memory and the file is written back
    once, and only when something actually changed. The file is read and
    written as UTF-8 with newline translation disabled, so existing line
    endings are preserved.

    Args:
        filename: path of the markdown file to update in place.
        url_map: dict of ``{old_url: new_url}``.

    Returns:
        None. Failures are printed rather than raised.
    """
    try:
        with open(filename, "r", encoding="utf-8", newline="") as f:
            content = f.read()

        updated = content
        for url, new_pic_url in url_map.items():
            updated = updated.replace(url, new_pic_url)

        # Leave the file untouched when there is nothing to replace.
        if updated == content:
            return

        with open(filename, "w", encoding="utf-8", newline="") as f:
            f.write(updated)
    except Exception as e:
        # Best effort: one unreadable or unwritable file must not stop the
        # rest of the migration.
        print(filename, '文件修改失败:', e)

def run(file):
    """Migrate the images of one markdown file.

    Downloads and re-uploads the file's images via ``get_pic_url`` and,
    when at least one image was migrated, rewrites the links in the file
    via ``modify_md``.

    Args:
        file: path of the markdown file to process.
    """
    # The log message used to name qiniu, but the upload target is Ali OSS.
    print("[ " + file + " ]" + ": download images and upload to ali oss.")
    # {old_url: new_url}
    url_map = get_pic_url(file)
    if url_map:
        print("[ " + file + " ]" + ": replace images.")
        modify_md(file, url_map)

def main(path, max_workers=4):
    """Migrate the images of every markdown file found under ``path``.

    Files are processed concurrently by ``run`` on a thread pool. An
    exception raised while processing a file is reported once all files
    have finished, instead of being dropped along with its future.

    Args:
        path: root directory that is scanned recursively for ``.md`` files.
        max_workers: number of worker threads; defaults to 4.
    """
    # Collect all markdown files
    files = list_file([], path)
    if not files:
        print("no markdown found, exit")
        return

    # The with-block waits for all submitted work and always shuts the pool
    # down, even if submitting a task fails part-way through.
    with ThreadPoolExecutor(max_workers) as th_pool:
        futures = {th_pool.submit(run, file): file for file in files}

    for future, file in futures.items():
        error = future.exception()
        if error is not None:
            print("[ " + file + " ]" + ": processing failed:", error)


if __name__ == "__main__":
    # pic_path has to stay a module-level name: get_pic_url reads it as a
    # global when building the local path of each downloaded image.
    pic_path = "./pic"
    md_path = "./md"
    pic_dir_missing = not os.path.exists(pic_path)
    if pic_dir_missing:
        # Create the download directory before any worker writes into it.
        os.makedirs(pic_path)
    main(md_path)

本文由『后端精进之路』原创，首发于博客 http://teckee.github.io/ , 转载请注明出处