Commit 91db36b6 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'merge-gitlab-elasticsearch-git' into 'master'

Import the gitlab-elasticsearch-git gem

Closes gitlab-elasticsearch-git#6

See merge request !1794
parents 9142821e c595bdc1
......@@ -113,7 +113,6 @@ gem 'seed-fu', '~> 2.3.5'
gem 'elasticsearch-model', '~> 0.1.9'
gem 'elasticsearch-rails', '~> 0.1.9'
gem 'elasticsearch-api', '5.0.3'
gem 'gitlab-elasticsearch-git', '1.2.0', require: "elasticsearch/git"
gem 'aws-sdk'
gem 'faraday_middleware-aws-signers-v4'
......
......@@ -287,14 +287,6 @@ GEM
mime-types (>= 1.19)
rugged (>= 0.23.0b)
github-markup (1.4.0)
gitlab-elasticsearch-git (1.2.0)
activemodel (~> 4.2)
activesupport (~> 4.2)
charlock_holmes (~> 0.7)
elasticsearch-api
elasticsearch-model (~> 0.1.9)
github-linguist (~> 4.7)
rugged (~> 0.24)
gitlab-flowdock-git-hook (1.0.1)
flowdock (~> 0.7)
gitlab-grit (>= 2.4.1)
......@@ -949,7 +941,6 @@ DEPENDENCIES
gemojione (~> 3.0)
gitaly (~> 0.5.0)
github-linguist (~> 4.7.0)
gitlab-elasticsearch-git (= 1.2.0)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-license (~> 1.0)
gitlab-markup (~> 1.5.1)
......
......@@ -3,12 +3,14 @@
require 'rubygems'
require 'bundler/setup'
require 'json'
require 'elasticsearch/git'
require 'active_support'
require 'active_support/core_ext'
require 'benchmark'
require File.expand_path('../lib/gitlab/elastic/client', File.dirname(__FILE__))
$: << File.expand_path('../lib', File.dirname(__FILE__))
require 'gitlab/elastic/client'
require 'elasticsearch/git'
Thread.abort_on_exception = true
......
require "elasticsearch/git/model"
require "elasticsearch/git/repository"
module Elasticsearch
module Git
end
end
require 'active_support/concern'
require 'charlock_holmes'
module Elasticsearch
module Git
module EncoderHelper
extend ActiveSupport::Concern
included do
def encode!(message)
return nil unless message.respond_to? :force_encoding
# if message is utf-8 encoding, just return it
message.force_encoding("UTF-8")
return message if message.valid_encoding?
# return message if message type is binary
detect = CharlockHolmes::EncodingDetector.detect(message)
return message.force_encoding("BINARY") if detect && detect[:type] == :binary
# encoding message to detect encoding
if detect && detect[:encoding]
message.force_encoding(detect[:encoding])
end
# encode and clean the bad chars
message.replace clean(message)
rescue
encoding = detect ? detect[:encoding] : "unknown"
"--broken encoding: #{encoding}"
end
private
def clean(message)
message.encode("UTF-16BE", undef: :replace, invalid: :replace, replace: "")
.encode("UTF-8")
.gsub("\0".encode("UTF-8"), "")
end
end
end
end
end
require 'linguist'
require 'elasticsearch/git/encoder_helper'
module Elasticsearch
module Git
class LiteBlob
include Linguist::BlobHelper
include Elasticsearch::Git::EncoderHelper
attr_accessor :id, :name, :path, :size, :mode, :commit_id
attr_writer :data
def initialize(repo, raw_blob_hash)
@id = raw_blob_hash[:oid]
@blob = repo.lookup(@id)
@mode = raw_blob_hash[:mode].to_s(8)
@size = @blob.size
@path = encode!(raw_blob_hash[:path])
@name = @path.split('/').last
end
def data
@data ||= encode!(@blob.content)
end
end
end
end
require 'active_support/concern'
require 'active_model'
require 'elasticsearch/model'
module Elasticsearch
module Git
module Model
extend ActiveSupport::Concern
included do
extend ActiveModel::Naming
include ActiveModel::Model
include Elasticsearch::Model
env = if defined?(::Rails)
::Rails.env.to_s
else
nil
end
index_name [self.name.downcase, 'index', env].compact.join('-')
settings \
index: {
analysis: {
analyzer: {
path_analyzer: {
type: 'custom',
tokenizer: 'path_tokenizer',
filter: %w(lowercase asciifolding)
},
sha_analyzer: {
type: 'custom',
tokenizer: 'sha_tokenizer',
filter: %w(lowercase asciifolding)
},
code_analyzer: {
type: 'custom',
tokenizer: 'standard',
filter: %w(code lowercase asciifolding),
char_filter: ["code_mapping"]
},
code_search_analyzer: {
type: 'custom',
tokenizer: 'standard',
filter: %w(lowercase asciifolding),
char_filter: ["code_mapping"]
}
},
tokenizer: {
sha_tokenizer: {
type: "edgeNGram",
min_gram: 5,
max_gram: 40,
token_chars: %w(letter digit)
},
path_tokenizer: {
type: 'path_hierarchy',
reverse: true
},
},
filter: {
code: {
type: "pattern_capture",
preserve_original: 1,
patterns: [
"(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
"(\\d+)"
]
}
},
char_filter: {
code_mapping: {
type: "mapping",
mappings: [
". => ' '"
]
}
},
}
}
end
end
end
end
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment