Comments (1)
import sys
import requests
import json
import re
import os
import urllib.parse
def get_image_filename(url):
    """Return the simplified image file name extracted from *url*.

    Only the path component of the URL is considered, so the query string
    and fragment are not part of the result. Anything following a literal
    ``#`` that is still present in the last path segment is cut off too.
    """
    url_path = urllib.parse.urlparse(url).path
    base_name = os.path.basename(url_path)
    # Keep only the main file-name part that precedes any '#'.
    main_part, _, _ = base_name.partition('#')
    return main_part
def download_image(image_url, save_path, timeout=30):
    """Download an image and store it at *save_path* (best effort).

    Args:
        image_url: URL of the image to fetch.
        save_path: Destination file path. Missing parent directories are
            created before writing.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        None. Failures (network errors, non-200 responses, file-system
        errors) are reported on stdout and never raised.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code != 200:
            # Report the failure instead of silently leaving a dangling link.
            print(f"Error downloading image {image_url}: HTTP {response.status_code}")
            return
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, 'wb') as file:
            file.write(response.content)
    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole export.
        print(f"Error downloading image {image_url}: {e}")
def replace_image_links(markdown_content, book_id):
    """Download every Markdown image and point its link at the local copy.

    Each ``![alt](target)`` occurrence has its target downloaded to
    ``download/<book_id>/images/<name>`` and rewritten to ``./images/<name>``.

    Args:
        markdown_content: Markdown source text to process.
        book_id: Identifier used to build the download directory.

    Returns:
        The Markdown text with image targets rewritten. Targets from which
        no file name can be derived are left untouched.
    """
    # Groups: 1 = "![alt](", 2 = link target, 3 = ")".
    image_pattern = re.compile(r'(!\[.*?\]\()(.*?)(\))')
    # Maps an original link to its local replacement (None = leave as is),
    # so a link that appears several times is downloaded only once.
    local_links = {}

    def _localize(match):
        link = match.group(2)
        if link not in local_links:
            image_name = get_image_filename(link)
            if image_name:
                download_image(link, f"download/{book_id}/images/{image_name}")
                local_links[link] = f"./images/{image_name}"
            else:
                # Without a file name the save path would be the directory.
                local_links[link] = None
        local_link = local_links[link]
        if local_link is None:
            return match.group(0)
        return f"{match.group(1)}{local_link}{match.group(3)}"

    # Substituting inside the matched image syntax only avoids rewriting
    # unrelated text that merely contains the same URL string.
    return image_pattern.sub(_localize, markdown_content)
def save_page(book_id, slug, path, timeout=30):
    """Save one document as a Markdown file with locally stored images.

    The document's Markdown source is requested from the Yuque docs API,
    its image links are downloaded and rewritten via
    ``replace_image_links``, and the result is written to *path* as UTF-8.

    Args:
        book_id: Identifier of the book the document belongs to.
        slug: Document slug used in the API URL.
        path: Destination path of the Markdown file.
        timeout: Seconds to wait for the API before giving up, so a single
            stalled request cannot block the whole export.

    Returns:
        None. Errors are printed to stdout and never raised.
    """
    try:
        docsdata = requests.get(
            f'https://www.yuque.com/api/docs/{slug}?book_id={book_id}&merge_dynamic_data=false&mode=markdown',
            timeout=timeout)
        if docsdata.status_code != 200:
            print("Failed to download document. Page might be deleted.", book_id, slug)
            return
        docsjson = json.loads(docsdata.content)
        markdown_content = replace_image_links(docsjson['data']['sourcecode'], book_id)
        with open(path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
    except Exception as e:
        # Deliberately broad: a failing page must not stop the other pages.
        print(f"Error saving page: {e}")
def get_book(url="https://www.yuque.com/burpheart/phpaudit", timeout=30):
    """Fetch a book and save each of its documents as a Markdown file.

    The book page embeds its data as a URL-encoded JSON string wrapped in a
    ``decodeURIComponent("...")`` call. That payload is decoded, and every
    table-of-contents entry that has a URL is saved to
    ``download/<book_id>/<title>.md`` via ``save_page``.

    Args:
        url: Address of the book's page.
        timeout: Seconds to wait for the book page before giving up.

    Returns:
        None. Errors are printed to stdout and never raised.
    """
    try:
        docsdata = requests.get(url, timeout=timeout)
        data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
        if not data:
            # Give a clear reason instead of an opaque IndexError below.
            print(f"Error getting book: no embedded book data found at {url}")
            return
        docsjson = json.loads(urllib.parse.unquote(data[0]))
        book_id = str(docsjson['book']['id'])
        os.makedirs(f"download/{book_id}/images", exist_ok=True)
        # Characters that are unsafe in file names are replaced by '_'.
        # Built once, with the backslash escaped explicitly.
        unsafe_chars = str.maketrans(dict.fromkeys('\\/:*?"<>|\n\r', '_'))
        for doc in docsjson['book']['toc']:
            if not doc['url']:
                # Entries without a URL have no document to download.
                continue
            filename = doc['title'].translate(unsafe_chars) + '.md'
            path = f"download/{book_id}/{filename}"
            save_page(book_id, doc['url'], path)
    except Exception as e:
        # Top-level boundary of the crawl: report the problem and stop.
        print(f"Error getting book: {e}")
if __name__ == '__main__':
    # An optional first command-line argument overrides the default book URL.
    cli_args = sys.argv[1:]
    if not cli_args:
        get_book()
    else:
        get_book(cli_args[0])
from yuque-crawl.
Related Issues (10)
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games; make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
Tencent's open source team in China.
from yuque-crawl.