
Adding Related Article Recommendations to MkDocs Pages


This post adds a related-article recommendation feature to MkDocs pages: the theme is extended through an overrides folder, a new hooks script does the work, and the documentation reading experience improves.


Steps

In mkdocs.yml, point the theme at an overrides folder via custom_dir (create the folder if it does not exist yet):

theme:
  name: material
  custom_dir: docs/overrides
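
If your build does not pick the script up automatically, note that MkDocs (1.4 and later) only loads hook scripts listed under the hooks option in mkdocs.yml; the custom_dir override alone does not register them. Assuming the file created in the next step lives at docs/overrides/hooks/related_posts.py, the registration would look like this:

hooks:
  - docs/overrides/hooks/related_posts.py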

Then create a related_posts.py file under docs/overrides/hooks/ with the following content.

Adjust the specific configuration to fit your own repository.

import os
import re
from collections import Counter, defaultdict
from textwrap import dedent
import hashlib
import yaml
from urllib.parse import urlparse

# Storage for all article info and indexes
article_index = {}
category_index = defaultdict(list)
keyword_index = defaultdict(set)

# Config: directories to index
INDEXED_DIRECTORIES = ['blog/', 'develop/']

# Config: pages excluded from recommendations (exact and pattern matches)
EXCLUDED_PAGES = {
    # exact path matches
    'blog/index.md',
    'develop/index.md',
    # add more excluded pages here
    # 'blog/special-page.md',
}

# Config: excluded page patterns (regular expressions)
EXCLUDED_PATTERNS = [
    r'.*\/index\.md$',  # exclude all index.md files
    r'.*\/archive\.md$',  # exclude all archive.md files
    r'blog\/posts?\/.*',  # exclude everything under blog/post/ and blog/posts/
    # add more patterns here
    # r'blog\/draft\/.*',  # exclude a drafts directory
]

# Config: similarity threshold and weights
SIMILARITY_CONFIG = {
    'min_threshold': 0.15,  # minimum similarity threshold
    'weights': {
        'keywords': 0.35,    # keyword weight
        'tags': 0.30,        # tag weight
        'categories': 0.20,  # category weight
        'path': 0.10,        # path-category weight
        'source_dir': 0.05   # source-directory weight
    },
    'title_similarity': 0.25  # title similarity weight
}

def is_page_excluded(file_path):
    """Check whether a page should be excluded from recommendations."""
    # exact match
    if file_path in EXCLUDED_PAGES:
        return True

    # pattern match
    for pattern in EXCLUDED_PATTERNS:
        if re.match(pattern, file_path):
            return True

    return False

def should_index_file(file_path):
    """Check whether a file should be indexed."""
    if not file_path.endswith('.md'):
        return False

    # skip excluded pages first
    if is_page_excluded(file_path):
        return False

    # only index files under the configured directories
    for directory in INDEXED_DIRECTORIES:
        if file_path.startswith(directory):
            return True

    return False

def extract_keywords(content, title):
    """Extract keywords from an article (improved algorithm)."""
    # strip the YAML front matter
    content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL | re.MULTILINE)

    # strip code blocks
    content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
    # strip HTML tags
    content = re.sub(r'<.*?>', '', content)
    # strip links
    content = re.sub(r'\[.*?\]\(.*?\)', '', content)
    # strip heading markers
    content = re.sub(r'^#+\s+', '', content, flags=re.MULTILINE)

    # combine title and body; title words get a higher weight
    title_words = re.findall(r'\b\w+\b', title.lower()) * 4  # repeat title words to weight them 4x
    content_words = re.findall(r'\b\w+\b', content.lower())
    all_words = title_words + content_words

    # extended stopword list (English and Chinese)
    stopwords = {
        # English stopwords
        'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'is', 'are', 'was', 'were',
        'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
        'this', 'that', 'these', 'those', 'with', 'from', 'by', 'as', 'can', 'but', 'not', 'if', 'it',
        'they', 'them', 'their', 'you', 'your', 'we', 'our', 'my', 'me', 'i', 'he', 'she', 'him', 'her',
        # common low-information words
        'about', 'above', 'after', 'again', 'all', 'also', 'any', 'because', 'before', 'between',
        'both', 'each', 'few', 'first', 'get', 'how', 'into', 'just', 'last', 'made', 'make', 'may',
        'most', 'new', 'now', 'old', 'only', 'other', 'over', 'said', 'same', 'see', 'some', 'such',
        'take', 'than', 'then', 'time', 'two', 'use', 'very', 'way', 'well', 'where', 'when', 'which',
        'while', 'who', 'why', 'work', 'world', 'year', 'years',
        # Chinese stopwords
        '的', '了', '和', '是', '就', '都', '而', '及', '与', '这', '那', '有', '在', '中', '为', '对', '等',
        '能', '会', '可以', '没有', '什么', '一个', '自己', '这个', '那个', '这些', '那些', '如果', '因为', '所以'
    }

    # filter words: length >= 2, not a stopword, not purely numeric
    words = [w for w in all_words 
             if len(w) >= 2 and w not in stopwords and not w.isdigit()]

    # return the 15 most frequent words
    return Counter(words).most_common(15)

def extract_metadata(content):
    """Extract article metadata from the YAML front matter."""
    metadata = {
        'title': "Untitled",
        'description': "",
        'tags': [],
        'categories': [],
        'disable_related': False  # whether related recommendations are disabled for this page
    }

    # try to parse the YAML front matter
    yaml_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if yaml_match:
        try:
            yaml_content = yaml_match.group(1)
            yaml_data = yaml.safe_load(yaml_content)
            if yaml_data:
                metadata['title'] = str(yaml_data.get('title', 'Untitled')).strip('"\'')
                metadata['description'] = str(yaml_data.get('description', '')).strip('"\'')
                metadata['disable_related'] = yaml_data.get('disable_related', False)

                # handle tags
                tags = yaml_data.get('tags', [])
                if isinstance(tags, list):
                    metadata['tags'] = [str(tag).strip() for tag in tags]
                elif isinstance(tags, str):
                    metadata['tags'] = [tag.strip() for tag in tags.split(',') if tag.strip()]

                # handle categories
                categories = yaml_data.get('categories', [])
                if isinstance(categories, list):
                    metadata['categories'] = [str(cat).strip() for cat in categories]
                elif isinstance(categories, str):
                    metadata['categories'] = [cat.strip() for cat in categories.split(',') if cat.strip()]
        except yaml.YAMLError:
            pass  # fall back to the defaults if YAML parsing fails

    # fall back to a regular expression if YAML parsing did not yield a title
    if metadata['title'] == "Untitled":
        title_match = re.search(r'^title:\s*(.+)$', content, re.MULTILINE)
        if title_match:
            metadata['title'] = title_match.group(1).strip('"\'')

    return metadata

def get_category_from_path(file_path):
    """Derive a category from the file path."""
    parts = file_path.split('/')
    if len(parts) > 2:
        return parts[1]  # blog/category/file.md or develop/category/file.md
    elif len(parts) > 1:
        return parts[0]  # blog or develop
    return "Uncategorized"

def calculate_content_hash(content):
    """Compute a content hash, used to detect content changes."""
    return hashlib.md5(content.encode('utf-8')).hexdigest()

def on_files(files, config):
    """Pre-process all articles and build the indexes."""
    global article_index, category_index, keyword_index

    # reset the indexes
    article_index.clear()
    category_index.clear()
    keyword_index.clear()

    processed_count = 0
    excluded_count = 0

    for file in files:
        if should_index_file(file.src_path):
            try:
                with open(file.abs_src_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                    # extract metadata
                    metadata = extract_metadata(content)

                    # skip pages that disable related recommendations
                    if metadata.get('disable_related', False):
                        excluded_count += 1
                        continue

                    # double-check the exclusion list
                    if is_page_excluded(file.src_path):
                        excluded_count += 1
                        continue

                    # extract keywords
                    keywords = extract_keywords(content, metadata['title'])

                    # derive the path category
                    path_category = get_category_from_path(file.src_path)

                    # build the article record
                    article_info = {
                        'title': metadata['title'],
                        'description': metadata['description'],
                        'tags': metadata['tags'],
                        'categories': metadata['categories'],
                        'path_category': path_category,
                        'keywords': keywords,
                        'url': file.url,
                        'path': file.src_path,
                        'content_hash': calculate_content_hash(content),
                        'source_dir': file.src_path.split('/')[0]  # blog or develop
                    }

                    # add to the main index
                    article_index[file.src_path] = article_info

                    # add to the category index
                    category_index[path_category].append(file.src_path)
                    for category in metadata['categories']:
                        if category:  # skip empty categories
                            category_index[category].append(file.src_path)

                    # add to the keyword index
                    for keyword, _ in keywords:
                        keyword_index[keyword].add(file.src_path)
                    for tag in metadata['tags']:
                        if tag:  # skip empty tags
                            keyword_index[tag.lower()].add(file.src_path)

                    processed_count += 1

            except Exception as e:
                print(f"❌ Error while processing {file.src_path}: {e}")

    print(f"✅ Indexed {processed_count} articles (blog + develop)")
    if excluded_count > 0:
        print(f"📝 Excluded {excluded_count} articles (recommendations disabled or on the exclusion list)")
    print(f"📊 Categories: {len(category_index)}")
    print(f"🔤 Keywords: {len(keyword_index)}")
    return files

def on_page_markdown(markdown, **kwargs):
    """Append related recommendations to each article."""
    page = kwargs['page']
    config = kwargs['config']

    # check whether this page should be processed at all
    if not should_index_file(page.file.src_path):
        return markdown

    # check whether the page is excluded
    if is_page_excluded(page.file.src_path):
        return markdown

    # check whether the page's metadata disables recommendations
    try:
        with open(page.file.abs_src_path, 'r', encoding='utf-8') as f:
            content = f.read()
            metadata = extract_metadata(content)
            if metadata.get('disable_related', False):
                return markdown
    except Exception:
        pass  # if reading fails, continue anyway

    # fetch the related articles
    related_articles = get_related_articles(page.file.src_path, max_count=5)

    if not related_articles:
        return markdown

    # read site_url from the config and extract the base path
    site_url = config.get('site_url', '')
    base_path = urlparse(site_url).path if site_url else '/'
    if not base_path.endswith('/'):
        base_path += '/'

    # build the recommendation HTML (with Safari-specific tweaks)
    recommendation_html = "\n"

    # add CSS, with fixes aimed at Safari
    recommendation_html += """<style>
.related-posts {
  margin-top: 1.5rem;
  padding-top: 0.75rem;
  border-top: 1px solid rgba(0,0,0,0.1);
  max-height: none !important; /* keep Safari from miscalculating the height */
  overflow: visible !important; /* keep the content from being clipped */
}
.related-posts h3 {
  margin-top: 0;
  margin-bottom: 0.5rem;
  font-size: 1.2rem;
  font-weight: 500;
  line-height: 1.3;
}
.related-posts ul {
  margin: 0 0 0.5rem 0 !important; /* override potentially conflicting styles */
  padding-left: 1.5rem;
  list-style-position: outside;
}
.related-posts li {
  margin-bottom: 0.25rem;
  line-height: 1.4;
}
/* dark mode */
[data-md-color-scheme="slate"] .related-posts {
  border-top-color: rgba(255,255,255,0.1);
}
/* Safari-specific fixes */
@supports (-webkit-hyphens:none) {
  .related-posts {
    display: block;
    position: relative;
    height: auto !important;
  }
  .related-posts ul {
    position: static;
  }
}
</style>
"""

    # simple, widely compatible HTML structure
    recommendation_html += '<div class="related-posts">\n'
    recommendation_html += '<h3>📚 Related Posts</h3>\n'
    recommendation_html += '<ul>\n'

    for score, article_info in related_articles:
        title = article_info['title']
        relative_url = article_info['url']
        # join the base path and the article's relative URL, normalising the separator
        full_url = (base_path + relative_url).replace('//', '/')
        recommendation_html += f'<li><a href="{full_url}">{title}</a></li>\n'

    recommendation_html += '</ul>\n'
    recommendation_html += '</div>\n'

    # strip trailing blank lines before appending the block
    return markdown.rstrip() + recommendation_html

def get_related_articles(current_path, max_count=5):
    """Get related articles using the improved scoring algorithm."""
    if current_path not in article_index:
        return []

    current_article = article_index[current_path]
    similarities = []

    # key information about the current article
    current_title = current_article['title'].lower()
    current_tags = set(tag.lower() for tag in current_article['tags'] if tag)
    current_categories = set(cat.lower() for cat in current_article['categories'] if cat)

    for path, article_info in article_index.items():
        if path == current_path:
            continue

        # skip articles without a usable title
        if article_info['title'] == "Untitled" or not article_info['title'].strip():
            continue

        # double-check the exclusion list
        if is_page_excluded(path):
            continue

        # base similarity score
        score = calculate_similarity(current_article, article_info)

        # title similarity bonus
        title_similarity = calculate_title_similarity(current_title, article_info['title'].lower())
        if title_similarity > 0.3:  # titles are at least somewhat similar
            score += title_similarity * SIMILARITY_CONFIG['title_similarity']

        # apply the minimum threshold
        if score > SIMILARITY_CONFIG['min_threshold']:
            similarities.append((score, article_info))

    # sort by similarity
    similarities.sort(key=lambda x: x[0], reverse=True)

    # diversity pass: give articles from different categories a chance to appear
    if len(similarities) > max_count * 2:
        # group by category
        category_groups = defaultdict(list)
        for score, article in similarities:
            for category in article['categories']:
                if category:
                    category_groups[category.lower()].append((score, article))

        # pick the most relevant article from each category
        diverse_results = []
        used_paths = set()

        # always keep the single most relevant article
        if similarities:
            top_score, top_article = similarities[0]
            diverse_results.append((top_score, top_article))
            used_paths.add(top_article['path'])

        # then take the best article from each category
        for category in sorted(category_groups.keys()):
            if len(diverse_results) >= max_count:
                break

            for score, article in category_groups[category]:
                if article['path'] not in used_paths:
                    diverse_results.append((score, article))
                    used_paths.add(article['path'])
                    break

        # fill any remaining slots with the highest-scoring leftovers
        if len(diverse_results) < max_count:
            for score, article in similarities:
                if article['path'] not in used_paths and len(diverse_results) < max_count:
                    diverse_results.append((score, article))
                    used_paths.add(article['path'])

        # re-sort by similarity
        diverse_results.sort(key=lambda x: x[0], reverse=True)
        return diverse_results[:max_count]

    return similarities[:max_count]

def calculate_title_similarity(title1, title2):
    """Compute the similarity of two titles."""
    # tokenize
    words1 = set(re.findall(r'\b\w+\b', title1))
    words2 = set(re.findall(r'\b\w+\b', title2))

    if not words1 or not words2:
        return 0

    # Jaccard similarity
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    if union == 0:
        return 0

    return intersection / union

def calculate_similarity(article1, article2):
    """Compute the similarity of two articles."""
    score = 0
    weights = SIMILARITY_CONFIG['weights']

    # 1. keyword similarity
    keywords1 = dict(article1['keywords'])
    keywords2 = dict(article2['keywords'])
    common_keywords = set(keywords1.keys()) & set(keywords2.keys())

    if common_keywords:
        # weight shared keywords by their frequency
        keyword_score = sum(min(keywords1[kw], keywords2[kw]) for kw in common_keywords)
        # bonus for the number of matching keywords
        keyword_count_bonus = len(common_keywords) / max(len(keywords1), 1) * 0.5
        score += (keyword_score + keyword_count_bonus) * weights['keywords']

    # 2. tag similarity
    tags1 = set(tag.lower() for tag in article1['tags'] if tag)
    tags2 = set(tag.lower() for tag in article2['tags'] if tag)

    if tags1 and tags2:  # both articles must have tags
        tag_overlap = len(tags1 & tags2)
        tag_ratio = tag_overlap / max(len(tags1), 1)  # relative overlap
        tag_score = tag_overlap * 8 * (1 + tag_ratio)  # reward a higher overlap ratio
        score += tag_score * weights['tags']

    # 3. category similarity
    categories1 = set(cat.lower() for cat in article1['categories'] if cat)
    categories2 = set(cat.lower() for cat in article2['categories'] if cat)

    if categories1 and categories2:  # both articles must have categories
        category_overlap = len(categories1 & categories2)
        category_ratio = category_overlap / max(len(categories1), 1)
        category_score = category_overlap * 12 * (1 + category_ratio)
        score += category_score * weights['categories']

    # 4. path-category similarity
    if article1['path_category'] == article2['path_category']:
        score += 3 * weights['path']

    # 5. same source directory bonus
    if article1.get('source_dir') == article2.get('source_dir'):
        score += 2 * weights['source_dir']

    return score
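
As a usage note: the hook reads a disable_related key from each page's front matter, so an individual page can opt out of the recommendation block with front matter along these lines (the title here is just a placeholder):

---
title: Some page
disable_related: true
---

After rebuilding the site (for example with mkdocs serve), every indexed page under blog/ or develop/ gets a "Related Posts" list appended at the bottom.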

The result looks like this: