# frozen_string_literal: true
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: BSD-3-Clause
require "jekyll/hooks"
require "jekyll/document"
require "json"
##
# This singleton facilitates production of an indexable JSON representation of the content to populate a data source
# to provide search functionality.
#
# To prevent the indexing of a document, include `omit_from_search: true` in the document's front matter.
# In rare circumstances, this indexer fails to find the title of some documents; including `primary_title` solves that.
#
# The plugin is disabled while `jekyll serve` is running unless `JEKYLL_ALLOW_CONTENT_INDEXER` is set on the
# environment to any value other than `false`.
# Usage: `JEKYLL_ALLOW_CONTENT_INDEXER= bundle exec jekyll serve --trace`
module Jekyll
  ##
  # Collects a plain-text representation of every converted Page and Document and writes the result to
  # `search-index.json` in the site destination, so it can be loaded into a search data source.
  #
  # Documents with `omit_from_search: true` in their front matter, or whose URL matches the excluded paths,
  # are skipped.
  module ContentIndexer
    ##
    # Search result type reported for each known collection label
    COLLECTION_TYPES = {
      'posts' => 'News',
      'authors' => 'Authors',
      'events' => 'Events',
      'versions' => 'Downloads',
      'testimonials' => 'Testimonials',
      'tutorials' => 'Tutorials'
    }.freeze

    ##
    # The entries that will get stored as the output, keyed by URL. Keying by URL means a document that is
    # processed again (for example when the site is regenerated while watching) replaces its previous entry
    # instead of being appended a second time.
    @data = {}

    ##
    # Tracks whether the per-document hooks are registered already; `init` may be called more than once
    @hooks_registered = false

    ##
    # Pattern to identify documents that should be excluded based on their URL
    @excluded_paths = /(^\/blog\/(page\d|$)|^\/events\/([^\d]|$)|^\/faqs|\.(css|js|json|map|xml|txt|yml)$|\/404\.html)/i.freeze

    ##
    # Pattern to identify block HTML tags (not comprehensive)
    @html_block_tags = /\s*<[?\/]?(article|blockquote|d[dlt]|div|fieldset|form|h|li|main|nav|[ou]l|p|section|table|t[rd]).*?>\s*/im.freeze

    ##
    # Pattern to identify certain HTML tags whose content should be excluded from indexing
    @html_excluded_tags = /\s*<(head|style|script|h1).*?>.*?<\/\1>/im.freeze

    ##
    # Pattern to extract the page heading from div.copy-banner > div.container > h1 > a.
    # The first capture group is the anchor text without surrounding whitespace.
    # NOTE(review): the tag literals are written out from the structure described above; confirm them
    # against the markup actually used by the layouts.
    @header_matcher = %r{<div\b[^>]*\bcopy-banner\b[^>]*>.*?<h1\b[^>]*>\s*<a\b[^>]*>\s*([^<]+?)\s*</a>.*?</h1>}m.freeze

    ##
    # Defines the priority of the plugin
    # The hooks are registered with a very low priority so that they run after the other hooks
    def self.priority
      1
    end

    ##
    # Initializes the singleton by recording the site and registering the indexing hooks.
    #
    # It is safe to call this repeatedly: the hooks are only registered the first time.
    #
    # site:: the Jekyll site being built; its `serving`, `baseurl` and `destination` settings are used
    def self.init(site)
      @site = site

      # Avoid initializing if serving and not forced to run
      if site.config["serving"] && ENV.fetch('JEKYLL_ALLOW_CONTENT_INDEXER', 'false') == 'false'
        Jekyll.logger.info(
          "ContentIndexer:",
          "disabled. Enable with JEKYLL_ALLOW_CONTENT_INDEXER on the environment"
        )
        return
      end

      # Registering again would make every document get processed once per registration
      return if @hooks_registered

      # Process a Page as soon as its content is ready
      Jekyll::Hooks.register(:pages, :post_convert, priority: priority) { |page| add(page) }
      # Process a Document as soon as its content is ready
      Jekyll::Hooks.register(:documents, :post_convert, priority: priority) { |document| add(document) }
      # Save the produced collection after Jekyll is done writing all its stuff
      Jekyll::Hooks.register(:site, :post_write, priority: priority) { |_| save }

      @hooks_registered = true
      Jekyll.logger.info "ContentIndexer:", "initialized"
    end

    ##
    # Processes a Document or Page and adds it to the collection.
    #
    # Nothing is added when the URL is excluded, when the front matter sets `omit_from_search`, or when no
    # text is left after the markup is stripped.
    #
    # page:: an object responding to `url`, `data` and `content`
    def self.add(page)
      return if @excluded_paths.match(page.url)
      return if page.data['omit_from_search']

      content = extract_text(page.content)
      return if content.empty?

      # Interpolation tolerates a missing (nil) baseurl
      url = "#{@site.config['baseurl']}#{page.url}"
      @data[url] = {
        url: url,
        title: title_for(page),
        content: content,
        # Array() accepts a list, a single value or nil alike
        keywords: Array(page.data["categories"]) + Array(page.data["keywords"]),
        type: type_for(page)
      }
    end

    ##
    # Saves the collection as a JSON file named `search-index.json` in the site destination
    def self.save
      path = File.join(@site.config["destination"], "search-index.json")
      File.write(path, "#{JSON.pretty_generate(@data.values)}\n")
    end

    ##
    # Reduces converted HTML to plain text with normalized whitespace
    def self.extract_text(html)
      html.to_s
          .gsub(@html_excluded_tags, ' ')           # Strip certain HTML blocks
          .gsub(@html_block_tags, "\n")             # Strip some block HTML tags, replacing with newline
          .gsub(/\s*<[?\/!]?[a-z]+.*?>\s*/im, ' ')  # Strip all remaining HTML tags
          .gsub(/\s*[\r\n]+\s*/, "\n")              # Clean line-breaks
          .gsub(/\s{2,}/, ' ')                      # Trim long spaces
          .gsub(/\s+([.:;,)!\]?])/, '\1')           # Remove spaces before some punctuations
          .strip                                    # Trim leading and trailing whitespaces
    end

    ##
    # Picks the search result type from the collection of a Document; anything else has no type
    def self.type_for(page)
      return nil unless page.instance_of?(Jekyll::Document)

      label = page.collection&.label
      COLLECTION_TYPES.fetch(label) do
        Jekyll.logger.warn "ContentIndexer:", "Unknown type: #{label}"
        nil
      end
    end

    ##
    # Picks the title from `title`, then `primary_title`, then the heading found in the content
    def self.title_for(page)
      title = page.data["title"]
      title = page.data["primary_title"] if blank?(title)
      return title unless blank?(title)

      # Page might be using context variables to set `primary_title`
      heading = @header_matcher.match(page.content.to_s)
      heading ? heading[1] : title
    end

    ##
    # Tells whether a front matter value is missing or empty; values without `empty?` count as present
    def self.blank?(value)
      value.nil? || (value.respond_to?(:empty?) && value.empty?)
    end

    private_class_method :extract_text, :type_for, :title_for, :blank?
  end
end
# Hand the site over to the ContentIndexer right before rendering starts, i.e. ahead of any Page or Document
# being converted
Jekyll::Hooks.register(:site, :pre_render, priority: Jekyll::ContentIndexer.priority) do |site|
  Jekyll::ContentIndexer.init(site)
end