// weiduhuo / Bangumi-To Romaji Title

// ==UserScript==
// @name         Bangumi-To Romaji Title
// @name:zh-CN   班固米-获取条目罗马字标题
// @version      0.3.5
// @description  Retrieve the Romaji title of the subject and display it in the infobox
// @author       weiduhuo
// @namespace    https://github.com/weiduhuo/scripts
// @match        *://bgm.tv/subject/*
// @match        *://bangumi.tv/subject/*
// @match        *://chii.in/subject/*
// @grant        none
// @license      MIT
// @description:zh-CN  基于MyAnimeList的非官方API Jikan,获取条目的罗马字标题,并呈现于infobox
// ==/UserScript==

(function () {
  'use strict';
  const SCRIPT_NAME = '班固米-罗马名获取组件';

  /** 单次查询返回的结果数量上限 */
  const QueryLimit = 10;
  /** 相关度的最低采用阈值 (含自身) */
  const minRelThr = 2.5;
  /** 相关度的触发再尝阈值 (不含自身) */
  const retryRelThr = 7.5;

  /** 枚举启用状态 */
  const EnableState = {
    /** 全部 */
    ALL_ENABLED: 'allEnabled',
    /** 仅中日 */
    ONLY_CJ: 'onlyChinese&Japanese',
    /** 仅日文 */
    ONLY_JAPANESE: 'onlyJapanese',
  };

  /** 启用状态 */
  let enableState = EnableState.ONLY_CJ;

  /** 所支持的条目类型 */
  const SubjectType = ['anime'];

  /** 地区待选标签 */
  const RegionTags = [
    ['中国', '国产'],
    ['日本', '日本动画'],
  ];
  /** Region codes, plus a reverse lookup from a code back to its key */
  const Region = {
    cn: 1,
    jp: 2,
    /**
     * Find the key whose value is strictly equal to `value`.
     * @param {*} value Region code to look up
     * @returns {string} The matching key, or the string 'null' when nothing matches
     */
    parse(value) {
      const hit = Object.entries(this).find(([, code]) => code === value);
      return hit ? hit[0] : 'null';
    }
  };

  /** 媒体类型映射 BGM to MAL */
  const PlatformMap = {
    'anime': {
      'TV': 'tv', // ['tv','tv_special'] 将通过 subTags 尝试区分
      '剧场版': 'movie',
      'OVA': 'ova',
      'WEB': '', // ['ona','music','special','cm','pv',...] 一对多,但 Jikan API 不支持多参数,因此空缺转而全范围搜索
                 // 'music' 将通过 subTags 尝试区分
      '动态漫画': '',
    },
  };

  /** 匹配假名 */
  const KanaRe = /[\p{sc=Hiragana}\p{sc=Katakana}]/u;
  /** 匹配汉字与假名 */
  const ZnJpRe = /[\p{sc=Hiragana}\p{sc=Katakana}\u30FC\u31F0-\u31FF\uFF61-\uFF9F\p{sc=Han}]/u;
  /** 匹配仅包含拉丁字母与符号 */
  // const OnlyLatinRe = /^[\s\u0020-\u00FF\u2000-\u206F\u2150-\u218F\u25A0-\u26FF\u3000-\u301E\uFE30-\uFF65\uFFE0-\uFFEF]+$/;
  // 上述分别匹配了空白字符、基本拉丁字母及补充、常用标点符号、数字形式、几何图形及杂项符号、CJK常见标点符号、 CJK兼容符号

  /** 匹配标题前缀 */
  const PrefTitleRe = /^(((劇場版?)?総集|短)編|(映画|劇場|同人)版?)\s*|.?(Official )?Music Video.?/i;
  /** 匹配标题后缀 */
  const SuffTitleRe = /\s[^\s]*(版)$/;
  // const SuffTitleRe = /\s[^\s]*([\d\u2150-\u218F][^\s]*|版)$/;
  /** 匹配标题短语 */
  const PhrasesRe = /(((劇場版?)?総集|短)編|(映画|劇場|同人)版?)|(\d+)|([a-z]{2,}|[之的]|(?:(?![之的])[\p{sc=Han}])+|[\p{sc=Hiragana}\u30FC]+|[\p{sc=Katakana}\u30FC\u31F0-\u31FF\uFF61-\uFF9F]+)/ug;
  // 注意 \p{scx=Han} 会匹配 '『』'符号
  /** 匹配标题短语分层过滤 */
  const PhrasesFilterRe = /^(映画|アニメ|第|st|nd|rd|th|season|章)$/;
  /** 匹配标题内符号 */
  const PunctRe = /[\u2000-\u206F\u25A0-\u26FF\u3000-\u301E\uFE30-\uFF65\uFFE0-\uFFEF]/g;

  /**
   * Entry point. Decides whether the script applies to the current subject page, inserts
   * placeholder title rows into the infobox, then fills them either from the sessionStorage
   * cache or from up to two Jikan API queries (a narrow one, then a broader retry when the
   * relevance score is low).
   * @returns {Promise<void>}
   */
  async function main() {
    const subType = getSubjectType();
    if (!SubjectType.includes(subType)) return;

    // Decide from the subject's region whether the feature is enabled
    const infobox = document.querySelector('#infobox');
    const rawTitle = getSubjectTitle();
    const isLatinTitle = !ZnJpRe.test(rawTitle); // true when the title contains no Han characters or kana
    const subTags = getSubjectTags();
    let region = includeTargetTag(subTags, ...RegionTags);
    if (!region) {
      // Fall back to the title, character names and summary (public tags do not cover every subject)
      if (KanaRe.test(rawTitle) || charNameHasKana() || KanaRe.test(getSubjectSummary())) {
        region = Region.jp;
      } else {
        if (enableState === EnableState.ALL_ENABLED) addTitle(infobox, region, isLatinTitle, rawTitle);
        return;
      }
    }
    if (region === Region.cn && enableState === EnableState.ONLY_JAPANESE) return;

    // Insert the placeholder rows
    const titleLis = addTitle(infobox, region, isLatinTitle);

    // Try the data cached in sessionStorage first
    const [, id] = getSubjectId();
    const key = `subtitle-${id}`;
    let data = sessionStorage.getItem(key);
    if (data) {
      data = JSON.parse(data);
      updateTitle(titleLis, [data.romaji, data.english]);
      console.log(`${SCRIPT_NAME}:`, {
        'relScore': data.relScore,
        'romaji': data.romaji,
        'english': data.english,
      });
      if (data.url) console.log(`${SCRIPT_NAME}:`, data.url); // logged on its own so the url is clickable
      return;
    }

    // First pass over the page data (the values needed for the API query come first)
    let platform = getPlatform(subType);
    if (includeTargetTag(subTags, ['MV'])) {
      platform = 'music';
    }
    if (platform === 'tv' && includeTargetTag(subTags, ['OVA', 'SP', 'TVSP'], ['MV']) === 1) {
      platform = 'tv_special';
    }
    const tips = infobox.querySelectorAll('span.tip');
    // Pass the region so getStartDate can prefer the Japanese date when several regions are listed
    const startDate = getStartDate(infobox, tips, region);
    let episodes, notFirstPart; // parsed lazily inside handlerQuery

    // Try to resolve the titles
    let subs, relScore, titles, url, mainTitle, phraseSet;
    mainTitle = rawTitle.replace(PrefTitleRe, ''); // works around Jikan weighting the leading characters too heavily
    mainTitle = mainTitle.replace(PunctRe, ' '); // works around Jikan weighting symbols such as 「」 too heavily
    const queryStartDate = startDate ? `${startDate.year - 1}-01-01` : ''; // conservative: year precision only, moved back one year
    await handlerQuery(platform, queryStartDate);
    const data_1 = packData();
    if (relScore >= minRelThr ) {
      updateTitle(titleLis, titles);
      sessionStorage.setItem(key, JSON.stringify(data_1));
    }
    if (relScore >= retryRelThr) return;

    // Relevance is low: widen the search (no platform or date filter) and try again
    console.log(`${SCRIPT_NAME}:相关度较低,扩大搜索范围,再次尝试`);
    const preCl = relScore;
    mainTitle = mainTitle.replace(SuffTitleRe, ''); // drop a trailing '…版' suffix, keeping only the main title
    await handlerQuery();
    // The platform filter was dropped for this query, so lower the score slightly
    if (platform && relScore) relScore -= 0.5;
    const data_2 = packData();
    if (relScore >= minRelThr && relScore > preCl) {
      updateTitle(titleLis, titles);
      sessionStorage.setItem(key, JSON.stringify(data_2));
    } else if (relScore < minRelThr && preCl < minRelThr ) {
      updateTitle(titleLis, ['NULL', 'NULL']);
    }

    /** Log the current result and pack it into the object stored in sessionStorage */
    function packData() {
      const data = {
        'relScore': relScore / 10,
        'romaji': titles[0],
        'english': titles[1],
      };
      console.log(`${SCRIPT_NAME}:`, data);
      if (url) console.log(`${SCRIPT_NAME}:`, url);
      data.url = url;
      return data;
    }

    /**
     * Run one query and update `subs`, `relScore`, `titles` and `url`.
     * @param {string} [_platform=''] Media type filter; empty searches every type
     * @param {string} [_startDate=''] Earliest start date 'Y-m-d'; empty means no filter
     */
    async function handlerQuery(_platform = '', _startDate ='') {
      const promise = querySubject(mainTitle, subType, _platform, _startDate);
      // Parse the remaining page data while the request is in flight
      episodes ??= getEpisodes(infobox, tips);
      // Check whether the episode list starts at the first episode, since BGM and
      // MyAnimeList may merge or split entries differently
      notFirstPart ??= !isFirstPart();
      phraseSet = getPhraseSet(mainTitle);
      if (!titles) {
        console.log(`${SCRIPT_NAME}:`, {
          'region': Region.parse(region),
          'platform': platform,
          'episodes': episodes,
          'startDate': startDate,
          'phraseSet': phraseSet,
        });
      }
      subs = await promise;
      if (Array.isArray(subs)) {
        [relScore, titles, url] = searchSubject(subs, phraseSet, isLatinTitle, startDate, episodes);
      } else {
        // querySubject returned a status string ('NULL' or 'ERROR'); show it in both rows
        [relScore, titles, url] = [0, [subs, subs], null];
      }
      // The start date is a weaker signal for entries that are not the first part, so scale the score down
      if (notFirstPart) relScore *= 0.75;
      titles = titles.map((title) => title.replace(/\s\((TV|OVA)\)/, '')); // strip the ' (TV)' / ' (OVA)' suffix
    }
  }

  /**
   * Query the Jikan API for subjects matching a title.
   * @param {string} title Original title used as the search text
   * @param {string} subType Subject type; used as the API path segment
   * @param {string} platform Media type filter; falsy values omit the `type` parameter
   * @param {string} startDate Earliest start date 'Y-m-d'; falsy values omit `start_date`
   * @param {number} limit Maximum number of results to request
   * @returns {Promise<string | Array<Object>>} The result list, 'NULL' when the response has no
   *   results, or 'ERROR' when the request fails or the server answers with a non-2xx status
   */
  async function querySubject(title, subType, platform, startDate, limit = QueryLimit) {
    const url = new URL(`https://api.jikan.moe/v4/${subType}`);
    url.searchParams.set('limit', limit);
    url.searchParams.set('q', title);
    if (platform) url.searchParams.set('type', platform);
    if (startDate) url.searchParams.set('start_date', startDate);
    try {
      let response;
      console.time(`Jikan API`);
      try {
        response = await fetch(url);
      } finally {
        // Always stop the timer, otherwise a failed request leaves it running for the retry
        console.timeEnd(`Jikan API`);
      }
      // HTTP errors (e.g. 429 rate limiting) carry no `data` field and must not be read as "no results"
      if (!response.ok) {
        console.error('Jikan API请求失败:', `HTTP ${response.status}`);
        return 'ERROR';
      }
      const data = await response.json();
      const subs = data?.data;
      // Callers rely on any non-string result being an array
      if (!Array.isArray(subs) || subs.length === 0) return 'NULL';
      return subs;
    } catch (error) {
      console.error('Jikan API请求失败:', error);
      return 'ERROR';
    }
  }

  /**
   * Score every candidate against the original title and pick the most relevant one.
   * Each candidate is mutated: it receives a `relScore` and its `index` in the incoming list.
   * @param {Array<Object>} subs Candidates returned by the query
   * @param {Set<string>} phraseSet Phrase set of the original title
   * @param {boolean} isLatinTitle Whether the original title has no Han/kana characters
   * @param {{year: number, month: number, day: number} | null} startDate Start date
   * @param {number | null} episodes Episode count
   * @returns {[number, [Object], string]} [`relScore`, [`romaji`, `english`], `url`]
   *   - `relScore` is out of 10: up to 6 points for how well the title phrases match,
   *     up to 4 points for how well the start date and episode count match
   */
  function searchSubject(subs, phraseSet, isLatinTitle, startDate, episodes) {
    const byScoreDesc = (a, b) => b.relScore - a.relScore;
    // A Latin-only search text is compared with both the romaji and the English title;
    // anything else is compared with the Japanese title
    const titleSimilarity = isLatinTitle
      ? (sub) => Math.max(
        jaccardSimilarity(phraseSet, sub.title),
        jaccardSimilarity(phraseSet, sub.title_english)
      )
      : (sub) => jaccardSimilarity(phraseSet, sub.title_japanese);

    console.groupCollapsed(`${SCRIPT_NAME}:详情`);
    const matched = [];
    for (const [position, candidate] of subs.entries()) {
      // Jaccard similarity scaled to the 6-point title share
      candidate.relScore = titleSimilarity(candidate) * 6;
      if (candidate.relScore) matched.push(candidate);
      candidate.index = position;
      const airedFrom = candidate.aired.from;
      console.log({
        'index': position,
        'simScore': candidate.relScore,
        'type': candidate.type,
        'startDate': airedFrom ? airedFrom.split('T')[0] : null,
        'episodes': candidate.episodes,
        'japanese': candidate.title_japanese,
        'romanji': candidate.title,
        'english': candidate.title_english,
        'url': candidate.url,
      });
    }

    // Keep only candidates with a non-zero title score, unless that would leave nothing
    let ranked = subs;
    if (matched.length) {
      ranked = matched;
      ranked.sort(byScoreDesc);
    }

    if (!startDate) return finalize(1, ranked);

    const sameYear = ranked.filter((sub) => sub.aired.prop.from.year === startDate.year);
    if (sameYear.length === 0) return finalize(0, ranked);

    const sameMonth = sameYear.filter(
      (sub) => startDate.month && sub.aired.prop.from.month === startDate.month
    );
    if (sameMonth.length === 0) return finalize(2, sameYear);
    if (sameMonth.length === 1) return finalize(4, sameMonth);
    if (!episodes) return finalize(3, sameMonth);

    // Several candidates share the start month: favour the one with the closest episode count
    let closest = sameMonth[0];
    let smallestGap = Infinity;
    for (const candidate of sameMonth) {
      const gap = Math.abs(episodes - candidate.episodes);
      if (gap < smallestGap) {
        smallestGap = gap;
        closest = candidate;
      }
    }
    closest.relScore += 2;
    return finalize(2, sameMonth);

    /** Add `bonus` to every candidate in `boosted`, re-rank the whole list and return the best entry */
    function finalize(bonus, boosted) {
      for (const candidate of boosted) candidate.relScore += bonus;
      ranked.sort(byScoreDesc);
      console.log('sortedByRelScore:', ranked.map((candidate) => [candidate.index, candidate.relScore]));
      const best = ranked[0];
      console.log('result:', best.index);
      console.groupEnd();
      best.title_english ??= 'NULL';
      return [best.relScore, [best.title, best.title_english], best.url];
    }
  }

  /**
   * Jaccard similarity between a phrase set and the phrase set extracted from a string.
   * @param {Set<string>} set1 Phrase set of the search title
   * @param {string | null | undefined} str2 Title to compare against
   * @returns {number} |A ∩ B| / |A ∪ B| in [0, 1]; 0 when both sets are empty
   */
  function jaccardSimilarity(set1, str2) {
    const set2 = getPhraseSet(str2);
    let shared = 0;
    for (const phrase of set1) {
      if (set2.has(phrase)) shared += 1;
    }
    const unionSize = set1.size + set2.size - shared;
    // Two empty sets would give 0 / 0 = NaN, which breaks score sorting and threshold checks
    return unionSize === 0 ? 0 : shared / unionSize;
  }

  /**
   * Split a title into its set of lower-cased phrases, dropping filtered filler phrases.
   * @param {string | null | undefined} title
   * @returns {Set<string>} Phrases matched by `PhrasesRe` and not rejected by `PhrasesFilterRe`;
   *   empty for a falsy title or when nothing matches
   */
  function getPhraseSet(title) {
    const phrases = new Set();
    if (!title) return phrases;
    for (const phrase of title.toLowerCase().match(PhrasesRe) ?? []) {
      if (!PhrasesFilterRe.test(phrase)) phrases.add(phrase);
    }
    return phrases;
  }

  /**
   * Read the subject type from the highlighted navigation entry.
   * @returns {string} Second '/'-separated segment of that entry's `href` attribute
   */
  function getSubjectType() {
    const activeNav = document.querySelector('#navMenuNeue .focus');
    const [, type] = activeNav.getAttribute('href').split('/');
    return type;
  }

  /**
   * Read the subject's title from the page header link.
   * @returns {string} Trimmed text of the header link
   */
  function getSubjectTitle() {
    const titleLink = document.querySelector('#headerSubject > h1 > a');
    return titleLink.textContent.trim();
  }

  /**
   * Collect the tag elements of the subject page.
   * @returns {NodeListOf<Element>} Every `span` inside the tag section's `.inner` container
   */
  function getSubjectTags() {
    const tagSelector = '.subject_tag_section > .inner span';
    return document.querySelectorAll(tagSelector);
  }

  /**
   * Read the subject's summary text.
   * @returns {string} Text content of `#subject_summary`, or '' when the page has no such
   *   element (so callers can run a regex test on the result without a null check)
   */
  function getSubjectSummary() {
    return document.querySelector('#subject_summary')?.textContent ?? '';
  }

  /**
   * Split the current path into its leading part and trailing numeric id.
   * @returns {[string | null, string | null]} [`patternType`, `subId`], or [null, null] when
   *   the path does not end in '/<digits>'
   */
  function getSubjectId() {
    const parts = /^\/(.+)\/(\d+)$/.exec(window.location.pathname);
    return parts ? [parts[1], parts[2]] : [null, null];
  }

  /**
   * Find which target group the subject's tags fall into. Tags are checked in order, and the
   * first tag that belongs to any group decides the result.
   * @param {NodeListOf<Element>} subTags Tag elements of the subject
   * @param {...Array<string>} targetTypeTags One array of tag texts per target group
   * @returns {number} 1-based number of the matched group, 0 when no tag matches
   */
  function includeTargetTag(subTags, ...targetTypeTags) {
    for (const { textContent } of subTags) {
      const label = textContent.trim();
      const groupIndex = targetTypeTags.findIndex((group) => group.includes(label));
      if (groupIndex !== -1) return groupIndex + 1;
    }
    return 0;
  }

  /**
   * Check whether any name in the page's character list contains kana.
   * @returns {boolean} true as soon as one `#browserItemList strong` text matches `KanaRe`
   */
  function charNameHasKana() {
    const nameNodes = document.querySelectorAll('#browserItemList strong');
    return Array.from(nameNodes).some((node) => KanaRe.test(node.innerText));
  }

  /**
   * Map the media-type label shown next to the page title to the MAL `type` value.
   * @param {string} subType Subject type used to select the mapping in `PlatformMap`
   * @returns {string} Mapped value, or '' when the label element is missing, the subject type
   *   has no mapping, or the label is not listed for that type
   */
  function getPlatform(subType) {
    const smallTag = document.querySelector('#headerSubject > h1 > small.grey');
    if (!smallTag) return '';
    const platform = smallTag.innerText.trim();
    const mapping = PlatformMap[subType];
    // Own-property check: `in` also matches inherited keys such as 'constructor'
    const isKnown = mapping != null && Object.prototype.hasOwnProperty.call(mapping, platform);
    return isKnown ? mapping[platform] : '';
  }

  /**
   * Tell whether the episode list on the page starts at the beginning.
   * @returns {boolean} true when the first listed episode is labelled '00' or '01',
   *   or when the page has no episode list
   */
  function isFirstPart() {
    const firstEp = document.querySelector('#subject_detail > .subject_prg > .prg_list > li:first-child');
    if (!firstEp) return true;
    const label = firstEp.innerText.trim();
    return label === '00' || label === '01';
  }

  /**
   * Read the episode count from the infobox row labelled '话数:'.
   * Only the first 11 tips are inspected.
   * @param {HTMLElement} infobox Infobox element; the row is its direct child
   * @param {NodeListOf<HTMLElement>} tips Label elements inside the infobox
   * @returns {number | null} First run of digits in the row's text, or null when the row
   *   is not found or contains no digits
   */
  function getEpisodes(infobox, tips) {
    const lastScannedIndex = 10;
    let row = null;
    for (let i = 0; i < tips.length && i <= lastScannedIndex; i++) {
      if (tips[i].innerText.trim() === '话数:') {
        row = tips[i];
        break;
      }
    }
    if (!row) return null;
    // Climb from the label up to the direct child of the infobox that contains it
    while (row.parentElement !== infobox) {
      row = row.parentElement;
    }
    const digits = row.textContent.match(/(\d+)/);
    return digits ? +digits[1] : null;
  }

  /**
   * Read the subject's start date from the infobox.
   * The row is chosen by label priority: start/broadcast date > theatrical release > sale date.
   * @param {HTMLElement} infobox Infobox element; the chosen row is its direct child
   * @param {NodeListOf<HTMLElement>} tips Label elements inside the infobox
   * @param {number} region Region code; with `Region.jp` a date attached to '日本' is preferred
   * @returns {{year: number, month: number, day: number} | null} Parsed date, or null when no
   *   row or no year is found; `month` / `day` are NaN when the text omits them
   */
  function getStartDate(infobox, tips, region) {
    const regex = /(开始|(?:放送|播出)(?:开始|日期))|([上公]映(?!许可))|(发售)/;
    // Priority 1 > 2 > 3, which is accurate most of the time
    let date = null;
    let preIndex = 10;
    for (const tip of tips) {
      const match = tip.innerText.match(regex);
      if (!match) continue;
      // 1-based number of the capture group that matched, i.e. the label's priority
      const index = match.slice(1, 4).findIndex(Boolean) + 1;
      if (index < preIndex) {
        date = tip; // only a higher-priority label may replace the current pick
        preIndex = index;
      }
      if (index === 1) break;
    }
    if (!date) return null;
    // Climb from the label up to the direct child of the infobox that contains it
    while (date.parentElement !== infobox) {
      date = date.parentElement;
    }
    const dateText = date.textContent;
    let match;
    if (region === Region.jp) {
      // Prefer the Japanese date. The gap after '日本' is matched lazily so the FIRST date
      // following it is taken; a greedy gap would skip ahead to the last 4-digit run, e.g.
      // another region's date or the end of a date range.
      match = dateText.match(/日本[^))]*?(\d{4})[-/年]?(\d{1,2})?[-/月]?(\d{1,2})?[-/日]?/);
      match ??= dateText.match(/(\d{4})[-/年]?(\d{1,2})?[-/月]?(\d{1,2})?[-/日]?[\s((]+日本/);
    }
    match ??= dateText.match(/(\d{4})[-/年]?(\d{1,2})?[-/月]?(\d{1,2})?[-/日]?/);
    if (!match) return null;
    return {
      year: +match[1],
      month: +match[2],
      day: +match[3]
    };
  }

  /**
   * Insert the title row(s) into the infobox: after the first row when that row is labelled
   * '中文名:', otherwise at the very top.
   * Each row is `<li><span class="tip">label: </span>title</li>`; the title is always the
   * row's second child node, which is what `updateTitle` overwrites later.
   * @param {HTMLElement} infobox
   * @param {number} region Region code; a falsy value adds only the index-name row
   * @param {boolean} isLatinTitle Whether the original title has no Han/kana characters
   * @param {string} [title='···'] Text shown in the row(s)
   * @returns {[HTMLElement]} `[romajiLi]` without a region, `[romajiLi, englishLi]` otherwise
   */
  function addTitle(infobox, region, isLatinTitle, title = '···') {
    // Build rows from DOM nodes rather than innerHTML: `title` can come from page text and
    // must be shown literally, never parsed as markup.
    const createRow = (label) => {
      const li = document.createElement('li');
      const tipSpan = document.createElement('span');
      tipSpan.className = 'tip';
      tipSpan.setAttribute('style', 'user-select: none');
      tipSpan.textContent = `${label}: `;
      li.appendChild(tipSpan);
      li.appendChild(document.createTextNode(title));
      return li;
    };

    let romajiTip;
    if (!region || isLatinTitle) {
      romajiTip = '索引名';
    } else if (region === Region.jp) {
      romajiTip = '罗马名';
    } else if (region === Region.cn) {
      romajiTip = '拼音名';
    }
    const romajiLi = createRow(romajiTip);

    let englishLi = null;
    if (region) {
      englishLi = createRow('英文名');
      englishLi.className = 'folded';
    }

    // A null reference node makes insertBefore append, which covers an empty infobox
    const firstLi = infobox.children[0] ?? null;
    const tip = firstLi?.querySelector('span.tip');
    const ref = tip && tip.innerText.trim() === '中文名:' ? firstLi.nextSibling : firstLi;
    infobox.insertBefore(romajiLi, ref);
    if (region) {
      infobox.insertBefore(englishLi, ref);
      return [romajiLi, englishLi];
    } else {
      return [romajiLi];
    }
  }

  /**
   * Write each title into the text node that follows the label of the matching row.
   * @param {[HTMLElement]} lis Rows created by `addTitle`
   * @param {[string]} titles Titles in the same order as `lis`
   */
  function updateTitle(lis, titles) {
    for (const [position, li] of lis.entries()) {
      li.childNodes[1].textContent = titles[position];
    }
  }

  // main() is async: attach a rejection handler so unexpected failures are reported
  // under the script's name instead of leaving the promise floating
  main().catch((error) => console.error(`${SCRIPT_NAME}:`, error));

})();