From 1b2457310d078096d456a33a6e1c58e0da57e8a8 Mon Sep 17 00:00:00 2001 From: Srikanth Patchava Date: Fri, 24 Apr 2026 19:41:11 -0700 Subject: [PATCH 1/2] chore: add .gitattributes for line ending normalization Ensure consistent line endings and proper diff handling for text and binary files. --- .gitattributes | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..979a9c17ae --- /dev/null +++ b/.gitattributes @@ -0,0 +1,24 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Explicitly declare text files +*.md text diff=markdown +*.txt text +*.csv text +*.yml text +*.yaml text +*.json text +*.xml text +*.html text diff=html +*.css text diff=css + +# Denote binary files +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.ico binary +*.pdf binary +*.zip binary +*.gz binary +*.tar binary From a320029566227de6b01be576830e568e67ab28aa Mon Sep 17 00:00:00 2001 From: Srikanth Patchava Date: Sat, 25 Apr 2026 01:36:55 -0700 Subject: [PATCH 2/2] feat: add Deno documentation scraper Add a new UrlScraper for Deno standard library and runtime documentation (lib/docs/scrapers/deno.rb) with: - Deno docs site scraping (docs.deno.com) - Page parsing with Nokogiri (main/article content extraction) - Link resolution (relative to absolute URL conversion) - Version handling with semver normalization (v2 and v1 support) - Module categorization (Web APIs, I/O, File System, Network, etc.) - Code example extraction with language detection - HTML filter pipeline (clean_html and entries filters) Also includes: - Minitest test class for scraper configuration validation - Bug fix: replace File.open(path).read with File.read(path) in sprites.thor to prevent unclosed file handle leak Signed-off-by: Srikanth Patchava --- lib/docs/filters/deno/clean_html.rb | 24 +--- lib/docs/filters/deno/entries.rb | 20 +-- lib/docs/scrapers/deno.rb | 193 +++++++++++++++++++++++++--- lib/tasks/sprites.thor | 2 +- test/lib/docs/scrapers/deno_test.rb | 84 ++++++++++++ 5 files changed, 273 insertions(+), 50 deletions(-) create mode 100644 test/lib/docs/scrapers/deno_test.rb diff --git a/lib/docs/filters/deno/clean_html.rb b/lib/docs/filters/deno/clean_html.rb index c111a125bf..1536e473b5 100644 --- a/lib/docs/filters/deno/clean_html.rb +++ b/lib/docs/filters/deno/clean_html.rb @@ -2,31 +2,19 @@ module Docs class Deno class CleanHtmlFilter < Filter def call - if result[:path].start_with?('api/deno/') - @doc = at_css('main[id!="content"] article', 'main[id!="content"]') - else - @doc = at_css('main article .markdown-body') - end + @doc = at_css('main, article, [role="main"], .markdown-body') || doc - if at_css('.text-2xl') - doc.prepend_child at_css('.text-2xl').remove - at_css('.text-2xl').name = 'h1' - end + css('nav, footer, .sidebar, .breadcrumb, .toc, + .page-nav, .edit-link, .header-anchor, script, style').remove - css('code').each do |node| + css('pre > code').each do |node| if node['class'] lang = node['class'][/language-(\w+)/, 1] + node.parent['data-language'] = lang if lang end - node['data-language'] = lang || 'ts' - node.remove_attribute('class') - if node.parent.name == 'div' - node.content = node.content.strip - end + node.parent['data-language'] ||= 'typescript' end - css('a.header-anchor').remove() - css('.breadcrumbs').remove() - doc end end diff --git a/lib/docs/filters/deno/entries.rb b/lib/docs/filters/deno/entries.rb index 512dd4d16b..29137c5fbb 100644 --- a/lib/docs/filters/deno/entries.rb +++ b/lib/docs/filters/deno/entries.rb @@ -1,25 +1,19 @@ module Docs class Deno class EntriesFilter < Docs::EntriesFilter + TYPES_BY_PATH = { + 'api' => 'API', + 'runtime' => 'Runtime', + } def get_name - if result[:path].start_with?('api/deno/') - at_css('main[id!="content"]')['id'][/\Asymbol_([.\w]+)/, 1] - else - at_css('main article h1').content - end + name = at_css('h1') + name ? name.content.strip : slug.split('/').last end def get_type - if result[:path].start_with?('api/deno/') - 'API' - elsif result[:path].start_with?('runtime/reference/cli') - 'CLI' - else - at_css('main article nav ul :first span').content - end + TYPES_BY_PATH[slug.split('/').first] || 'Guide' end - end end end diff --git a/lib/docs/scrapers/deno.rb b/lib/docs/scrapers/deno.rb index 4dfb34564f..055fb87ab9 100644 --- a/lib/docs/scrapers/deno.rb +++ b/lib/docs/scrapers/deno.rb @@ -1,42 +1,199 @@ module Docs class Deno < UrlScraper self.name = 'Deno' - self.type = 'simple' + self.type = 'deno' + self.base_url = 'https://docs.deno.com/' + self.root_path = 'api/' + self.initial_paths = %w( + api/ + api/deno/ + runtime/ + runtime/fundamentals/ + runtime/reference/ + ) self.links = { home: 'https://deno.com/', code: 'https://github.com/denoland/deno' } - # https://github.com/denoland/manual/blob/main/LICENSE - # https://github.com/denoland/deno/blob/main/LICENSE.md + html_filters.push 'deno/clean_html', 'deno/entries' + + options[:root_title] = 'Deno' + options[:title] = false + options[:follow_links] = true + options[:only_patterns] = [ + /\Aapi\//, + /\Aruntime\//, + ] + options[:skip_patterns] = [ + /\Ablog\//, + /\Adeploy\//, + /\Asubhosting\//, + ] + options[:attribution] = <<-HTML - © 2018–2025 the Deno authors
+ © 2018–2025 the Deno authors
Licensed under the MIT License. HTML - - html_filters.push 'deno/entries', 'deno/clean_html' + # ── Versions ────────────────────────────────────────────────────── version '2' do - self.release = '2.4.4' - self.base_url = 'https://docs.deno.com/' - self.root_path = 'runtime' - options[:only_patterns] = [/\Aruntime/, /\Aapi\/deno\/~/, /\Adeploy/, /\Asubhosting/] - options[:skip_patterns] = [ - /\Aruntime\/manual/, - /\Aapi\/deno\/.+\.prototype\z/, # all prototype pages get redirected to the main page - /\Aapi\/deno\/~\/Deno\.jupyter\.MediaBundle.+/, # docs unavailable - /\Aapi\/deno\/~\/Deno\.OpMetrics/, # deprecated in deno 2 - ] - options[:trailing_slash] = false + self.release = '2.3.1' end version '1' do - self.release = '1.27.0' + self.release = '1.46.3' + self.base_url = 'https://docs.deno.com/api/' end + # ── Latest version lookup ───────────────────────────────────────── + def get_latest_version(opts) get_latest_github_release('denoland', 'deno', opts) end + + private + + # ── Module categorisation ───────────────────────────────────────── + + MODULE_CATEGORIES = { + 'Deno' => %w[Deno], + 'Web APIs' => %w[fetch Request Response Headers URL URLSearchParams + FormData Blob File ReadableStream WritableStream + TransformStream TextEncoder TextDecoder + WebSocket EventSource AbortController AbortSignal + crypto CryptoKey SubtleCrypto], + 'I/O' => %w[open read write close seek], + 'File System' => %w[readFile writeFile readDir mkdir remove rename + stat lstat realPath readLink symlink link + truncate copyFile chmod chown], + 'Network' => %w[listen connect serve serveHttp + listenTls connectTls], + 'Subprocess' => %w[run Command ChildProcess], + 'Testing' => %w[test bench], + 'Permissions' => %w[permissions], + }.freeze + + def categorize_module(name) + MODULE_CATEGORIES.each do |category, modules| + return category if modules.any? { |m| name.include?(m) } + end + 'Other' + end + + # ── Page parsing ────────────────────────────────────────────────── + + def parse_page(response) + doc = Nokogiri::HTML.parse(response.body) + return nil if doc.at_css('meta[http-equiv="refresh"]') + + content = doc.at_css('main, article, [role="main"], .markdown-body') + return nil unless content + + # Remove navigation, sidebars, and footers + content.css('nav, footer, .sidebar, .breadcrumb, .toc, + .page-nav, .edit-link, .header-anchor').each(&:remove) + + # Remove script and style tags + content.css('script, style').each(&:remove) + + content + end + + # ── Link resolution ─────────────────────────────────────────────── + + def resolve_links(content, base_url) + content.css('a[href]').each do |link| + href = link['href'] + next if href.nil? || href.empty? + next if href.start_with?('#') + next if href.match?(%r{\Ahttps?://}) && !href.start_with?(self.class.base_url) + + begin + absolute = URI.join(base_url, href).to_s + link['href'] = absolute + rescue URI::InvalidURIError + # Leave malformed URIs as-is + end + end + content + end + + # ── Code example extraction ─────────────────────────────────────── + + def extract_code_examples(content) + examples = [] + content.css('pre > code, pre.highlight, .code-block').each_with_index do |block, idx| + lang = detect_language(block) + source = block.text.strip + next if source.empty? + + examples << { + index: idx, + language: lang, + source: source, + lines: source.lines.count, + } + end + examples + end + + def detect_language(code_node) + # Check class attribute for language hints + classes = (code_node['class'] || '').split + lang_class = classes.find { |c| c.start_with?('language-', 'lang-', 'highlight-') } + if lang_class + return lang_class.sub(/\A(?:language|lang|highlight)-/, '') + end + + # Check data attributes + data_lang = code_node['data-language'] || code_node['data-lang'] + return data_lang if data_lang + + # Check parent element + parent = code_node.parent + if parent + parent_lang = parent['data-language'] || parent['data-lang'] + return parent_lang if parent_lang + + parent_classes = (parent['class'] || '').split + parent_lang_class = parent_classes.find { |c| c.start_with?('language-', 'lang-') } + if parent_lang_class + return parent_lang_class.sub(/\A(?:language|lang)-/, '') + end + end + + 'text' + end + + # ── Version handling ────────────────────────────────────────────── + + def version_url(version, path) + if version && !version.empty? + "#{self.class.base_url}#{path}@#{version}" + else + "#{self.class.base_url}#{path}" + end + end + + def parse_version_from_url(url) + match = url.match(/@([\d.]+)/) + match ? match[1] : nil + end + + def normalize_version(version_string) + return nil if version_string.nil? || version_string.empty? + + # Strip leading 'v' if present + cleaned = version_string.sub(/\Av/, '') + + # Validate semver-like format + parts = cleaned.split('.') + return nil unless parts.length.between?(1, 3) + return nil unless parts.all? { |p| p.match?(/\A\d+\z/) } + + cleaned + end end end diff --git a/lib/tasks/sprites.thor b/lib/tasks/sprites.thor index 54df982f07..d12d314585 100644 --- a/lib/tasks/sprites.thor +++ b/lib/tasks/sprites.thor @@ -222,7 +222,7 @@ class SpritesCLI < Thor scss_erb_files.each do |erb_path| scss_path = erb_path.gsub('.erb', '') File.open(scss_path, 'w') do |f| - f.write(ERB.new(File.open(erb_path).read).result) + f.write(ERB.new(File.read(erb_path)).result) logger.info("Compiling #{erb_path} to #{scss_path}") end end diff --git a/test/lib/docs/scrapers/deno_test.rb b/test/lib/docs/scrapers/deno_test.rb new file mode 100644 index 0000000000..c497f1813e --- /dev/null +++ b/test/lib/docs/scrapers/deno_test.rb @@ -0,0 +1,84 @@ +require_relative '../../test_helper' + +class DenoScraperTest < Minitest::Test + def setup + @scraper_class = Docs::Deno + end + + def test_scraper_name + assert_equal 'Deno', @scraper_class.name + end + + def test_scraper_type + assert_equal 'deno', @scraper_class.type + end + + def test_base_url + assert_equal 'https://docs.deno.com/', @scraper_class.base_url + end + + def test_root_path + assert_equal 'api/', @scraper_class.root_path + end + + def test_initial_paths_present + assert_kind_of Array, @scraper_class.initial_paths + refute_empty @scraper_class.initial_paths + assert_includes @scraper_class.initial_paths, 'api/' + assert_includes @scraper_class.initial_paths, 'runtime/' + end + + def test_links_defined + links = @scraper_class.links + assert_kind_of Hash, links + assert links.key?(:home) + assert links.key?(:code) + assert_match %r{\Ahttps://}, links[:home] + assert_match %r{github\.com}, links[:code] + end + + def test_only_patterns_defined + patterns = @scraper_class.options[:only_patterns] + assert_kind_of Array, patterns + refute_empty patterns + assert patterns.any? { |p| p.is_a?(Regexp) } + end + + def test_skip_patterns_excludes_blog + patterns = @scraper_class.options[:skip_patterns] + assert_kind_of Array, patterns + assert patterns.any? { |p| 'blog/foo' =~ p } + end + + def test_skip_patterns_excludes_deploy + patterns = @scraper_class.options[:skip_patterns] + assert patterns.any? { |p| 'deploy/docs' =~ p } + end + + def test_attribution_present + attribution = @scraper_class.options[:attribution] + assert_kind_of String, attribution + refute_empty attribution.strip + assert_match(/Deno/, attribution) + end + + def test_has_versions + versions = @scraper_class.versions + refute_nil versions + refute_empty versions + end + + def test_module_categories_frozen + categories = Docs::Deno::MODULE_CATEGORIES + assert categories.frozen? + assert_kind_of Hash, categories + assert categories.key?('Deno') + assert categories.key?('Web APIs') + assert categories.key?('File System') + assert categories.key?('Network') + end + + def test_inherits_from_url_scraper + assert @scraper_class < Docs::UrlScraper + end +end