From 8906cabae7a6be44cafcedcaf27352614fcc462b Mon Sep 17 00:00:00 2001
From: Douwe Maan <douwe@gitlab.com>
Date: Tue, 18 Aug 2015 17:02:26 -0700
Subject: [PATCH] Parse reply-by-email bodies and extract Gitlab::ReplyByEmail

---
 Gemfile                              |   2 +
 Gemfile.lock                         |   2 +
 app/mailers/notify.rb                |  10 +-
 app/models/sent_notification.rb      |   8 +-
 app/workers/email_receiver_worker.rb |   2 +-
 config/initializers/1_settings.rb    |   2 +-
 lib/gitlab/email_html_cleaner.rb     | 133 +++++++++++++++++++++++++
 lib/gitlab/email_receiver.rb         | 144 +++++++++++++++++++++++----
 lib/gitlab/reply_by_email.rb         |  47 +++++++++
 9 files changed, 320 insertions(+), 30 deletions(-)
 create mode 100644 lib/gitlab/email_html_cleaner.rb
 create mode 100644 lib/gitlab/reply_by_email.rb

diff --git a/Gemfile b/Gemfile
index e3f76671f5..9879141f5c 100644
--- a/Gemfile
+++ b/Gemfile
@@ -274,3 +274,5 @@ gem "newrelic_rpm"
 gem 'octokit', '3.7.0'
 
 gem "mail_room", github: "DouweM/mail_room", branch: "sidekiq"
+
+gem 'email_reply_parser'
diff --git a/Gemfile.lock b/Gemfile.lock
index 597eb4cde0..629d128b42 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -163,6 +163,7 @@ GEM
     dotenv (0.9.0)
     dropzonejs-rails (0.7.1)
       rails (> 3.1)
+    email_reply_parser (0.5.8)
     email_spec (1.6.0)
       launchy (~> 2.1)
       mail (~> 2.2)
@@ -780,6 +781,7 @@ DEPENDENCIES
   diffy (~> 3.0.3)
   doorkeeper (= 2.1.3)
   dropzonejs-rails
+  email_reply_parser
   email_spec (~> 1.6.0)
   enumerize
   factory_girl_rails
diff --git a/app/mailers/notify.rb b/app/mailers/notify.rb
index 44df3d6407..c2ea99d968 100644
--- a/app/mailers/notify.rb
+++ b/app/mailers/notify.rb
@@ -146,7 +146,7 @@ class Notify < ActionMailer::Base
 
     if reply_key
       headers['X-GitLab-Reply-Key'] = reply_key
-      headers['Reply-To'] = Gitlab.config.reply_by_email.address.gsub('%{reply_key}', reply_key)
+      headers['Reply-To'] = Gitlab::ReplyByEmail.reply_address(reply_key)
     end
 
     mail(headers)
@@ -165,6 +165,10 @@ class Notify < ActionMailer::Base
     headers['In-Reply-To'] = message_id(model)
     headers['References'] = message_id(model)
 
+    if headers[:subject]
+      headers[:subject].prepend('Re: ')
+    end
+
     mail_new_thread(model, headers)
   end
 
@@ -173,8 +177,6 @@ class Notify < ActionMailer::Base
   end
 
   def reply_key
-    return nil unless Gitlab.config.reply_by_email.enabled
-
-    @reply_key ||= SecureRandom.hex(16)
+    @reply_key ||= Gitlab::ReplyByEmail.reply_key
   end
 end
diff --git a/app/models/sent_notification.rb b/app/models/sent_notification.rb
index a3d24669b5..23a1b19ea7 100644
--- a/app/models/sent_notification.rb
+++ b/app/models/sent_notification.rb
@@ -6,8 +6,8 @@ class SentNotification < ActiveRecord::Base
   validate :project, :recipient, :reply_key, presence: true
   validate :reply_key, uniqueness: true
 
-  validates :noteable_id, presence: true, if: ->(n) { n.noteable_type.present? && n.noteable_type != 'Commit' }
-  validates :commit_id, presence: true, if: ->(n) { n.noteable_type == 'Commit' }
+  validates :noteable_id, presence: true, unless: :for_commit?
+  validates :commit_id, presence: true, if: :for_commit?
 
   def self.for(reply_key)
     find_by(reply_key: reply_key)
@@ -19,11 +19,9 @@ class SentNotification < ActiveRecord::Base
 
   def noteable
     if for_commit?
-      project.commit(commit_id)
+      project.commit(commit_id) rescue nil
     else
       super
     end
-  rescue
-    nil
   end
 end
diff --git a/app/workers/email_receiver_worker.rb b/app/workers/email_receiver_worker.rb
index e44a430f6b..94e346b5a5 100644
--- a/app/workers/email_receiver_worker.rb
+++ b/app/workers/email_receiver_worker.rb
@@ -4,7 +4,7 @@ class EmailReceiverWorker
   sidekiq_options queue: :incoming_email
 
   def perform(raw)
-    return unless Gitlab.config.reply_by_email.enabled
+    return unless Gitlab::ReplyByEmail.enabled?
 
     # begin
       Gitlab::EmailReceiver.new(raw).process
diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb
index 9e83660e10..654de6238d 100644
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -153,7 +153,7 @@ Settings.gitlab['restricted_signup_domains'] ||= []
 # Reply by email
 #
 Settings['reply_by_email'] ||= Settingslogic.new({})
-Settings.reply_by_email['enabled'] = false if Settings.gravatar['enabled'].nil?
+Settings.reply_by_email['enabled'] = false if Settings.reply_by_email['enabled'].nil?
 
 #
 # Gravatar
diff --git a/lib/gitlab/email_html_cleaner.rb b/lib/gitlab/email_html_cleaner.rb
new file mode 100644
index 0000000000..6d7a17fe87
--- /dev/null
+++ b/lib/gitlab/email_html_cleaner.rb
@@ -0,0 +1,133 @@
+# Taken mostly from Discourse's Email::HtmlCleaner
+module Gitlab
+  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
+  # generate by stripping out any excess divs or spans, removing styling in
+  # the process (which also makes the html more suitable to be parsed as
+  # Markdown).
+  class EmailHtmlCleaner
+    # Elements to hoist all children out of (frozen: shared, read-only list)
+    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td).freeze
+    # Node types to always delete (frozen: shared, read-only list)
+    HTML_DELETE_ELEMENT_TYPES = [
+      Nokogiri::XML::Node::DTD_NODE,
+      Nokogiri::XML::Node::COMMENT_NODE,
+    ].freeze
+
+    # Instance state:
+    #   @doc - Nokogiri document being cleaned
+    #   @out - same object as @doc, set only once trimming has occurred
+    #
+    # html - either an HTML String, which is parsed with Nokogiri::HTML, or
+    #        any other object, which is stored as-is as the document.
+    def initialize(html)
+      already_parsed = !html.is_a?(String)
+      @doc = already_parsed ? html : Nokogiri::HTML(html)
+    end
+
+    class << self
+      # EmailHtmlCleaner.trim(inp, opts={})
+      #
+      # Arguments:
+      #   inp - Either a HTML string or a Nokogiri document.
+      # Options:
+      #   :return => :doc, :string
+      #     Specify the desired return type.
+      #     Defaults to the type of the input.
+      #     A value of :string is equivalent to calling get_document_text()
+      #     on the returned document.
+      #
+      # The opts hash passed by the caller is read but never modified.
+      def trim(inp, opts={})
+        cleaner = EmailHtmlCleaner.new(inp)
+
+        # Resolve the default locally instead of writing it back into opts.
+        return_type = opts[:return] || (inp.is_a?(String) ? :string : :doc)
+        return cleaner.output_html if return_type == :string
+
+        cleaner.output_document
+      end
+
+      # EmailHtmlCleaner.get_document_text(doc)
+      #
+      # Get the body portion of the document, including html, as a string.
+      # Falls back to the whole document when it has no <body> element.
+      #
+      # at_xpath is used instead of xpath because xpath returns a NodeSet,
+      # which is truthy even when empty, so the fallback could never run.
+      def get_document_text(doc)
+        body = doc.at_xpath('//body')
+        (body || doc).inner_html
+      end
+    end
+
+    # Cleans the document on first use and memoizes it in @out; the document
+    # is modified in place, so the returned object is @doc itself.
+    def output_document
+      return @out if @out
+      trim_process_node @doc
+      add_newlines @doc
+      @out = @doc
+    end
+
+    def output_html
+      EmailHtmlCleaner.get_document_text(output_document)
+    end
+
+    private
+
+    # Inserts newline text nodes so line-wise postprocessing and Markdown
+    # paragraphs work: every <br> is replaced by two newlines, and every <p>
+    # gets one newline before it and two after it.
+    def add_newlines(doc)
+      doc.xpath('//br').each do |line_break|
+        line_break.replace(new_linebreak_node(doc, 2))
+      end
+      doc.xpath('//p').each do |paragraph|
+        paragraph.before(new_linebreak_node(doc))
+        paragraph.after(new_linebreak_node(doc, 2))
+      end
+    end
+
+    def new_linebreak_node(doc, count=1)
+      Nokogiri::XML::Text.new("\n" * count, doc)
+    end
+
+    # Recursively cleans node and returns it: hoistable elements are replaced
+    # by their children (each cleaned in turn), deletable nodes are removed,
+    # and any other node is kept while its children are cleaned.
+    def trim_process_node(node)
+      if should_hoist?(node)
+        trim_hoist_element(node).each { |child| trim_process_node child }
+      elsif should_delete?(node)
+        node.remove
+      else
+        children = node.children
+        children.each { |child| trim_process_node child } if children
+      end
+      node
+    end
+
+    # Moves element's children in front of it, removes it, returns the children.
+    def trim_hoist_element(element)
+      hoisted = element.children.map do |child|
+        element.before(child)
+        child
+      end
+      element.remove
+      hoisted
+    end
+
+    # True only for element nodes whose name is in HTML_HOIST_ELEMENTS.
+    def should_hoist?(node)
+      node.element? && HTML_HOIST_ELEMENTS.include?(node.name)
+    end
+
+    # A node is deleted when its type is listed in HTML_DELETE_ELEMENT_TYPES,
+    # when it is a <head> element, or when it is whitespace-only text.
+    def should_delete?(node)
+      return true if HTML_DELETE_ELEMENT_TYPES.include?(node.type)
+      return true if node.element? && node.name == 'head'
+      node.text? && node.text.strip.blank?
+    end
+  end
+end
diff --git a/lib/gitlab/email_receiver.rb b/lib/gitlab/email_receiver.rb
index a9f67bb5da..18a06d2ee8 100644
--- a/lib/gitlab/email_receiver.rb
+++ b/lib/gitlab/email_receiver.rb
@@ -1,44 +1,69 @@
+# Inspired in great part by Discourse's Email::Receiver
 module Gitlab
   class EmailReceiver
+    class ProcessingError < StandardError; end
+    class EmailUnparsableError < ProcessingError; end
+    class EmptyEmailError < ProcessingError; end
+    class UserNotFoundError < ProcessingError; end
+    class UserNotAuthorizedLevelError < ProcessingError; end
+    class NoteableNotFoundError < ProcessingError; end
+    class AutoGeneratedEmailError < ProcessingError; end
+    class SentNotificationNotFound < ProcessingError; end
+    class InvalidNote < ProcessingError; end
+
     def initialize(raw)
       @raw = raw
     end
 
     def message
       @message ||= Mail::Message.new(@raw)
+    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
+      raise EmailUnparsableError, e
     end
 
     def process
-      return unless message && sent_notification
+      raise EmptyEmailError if @raw.blank?
+
+      raise AutoGeneratedEmailError if message.header.to_s =~ /auto-(generated|replied)/
+
+      raise SentNotificationNotFound unless sent_notification
+
+      author = sent_notification.recipient
+
+      raise UserNotFoundError unless author
+
+      project = sent_notification.project
+
+      raise UserNotAuthorizedLevelError unless author.can?(:create_note, project)
+
+      raise NoteableNotFoundError unless sent_notification.noteable
+
+      body = parse_body(message)
 
-      Notes::CreateService.new(
-        sent_notification.project,
-        sent_notification.recipient,
-        note:           message.text_part.to_s,
+      note = Notes::CreateService.new(
+        project,
+        author,
+        note:           body,
         noteable_type:  sent_notification.noteable_type,
         noteable_id:    sent_notification.noteable_id,
         commit_id:      sent_notification.commit_id
       ).execute
+
+      unless note.persisted?
+        raise InvalidNote, note.errors.full_messages.join("\n")
+      end
     end
 
     private
 
     def reply_key
-      address = Gitlab.config.reply_by_email.address
-      return nil unless address
-
-      regex = Regexp.escape(address)
-      regex = regex.gsub(Regexp.escape('%{reply_key}'), "(.*)")
-      regex = Regexp.new(regex)
-
-      address = message.to.find { |address| address =~ regex }
-      return nil unless address
+      reply_key = nil
+      message.to.each do |address|
+        reply_key = Gitlab::ReplyByEmail.reply_key_from_address(address)
+        break if reply_key
+      end
 
-      match = address.match(regex)
-        
-      return nil unless match && match[1].present?
-
-      match[1]
+      reply_key
     end
 
     def sent_notification
@@ -46,5 +71,86 @@ module Gitlab
       
       SentNotification.for(reply_key)
     end
+
+    # Extracts the reply text from message and returns it encoded as UTF-8.
+    # Raises EmptyEmailError when there is no body to work with (select_body
+    # can return nil) or when a trimming step leaves nothing behind.
+    def parse_body(message)
+      body = select_body(message)
+      raise EmptyEmailError if body.nil? || body.strip.blank?
+
+      encoding = body.encoding
+      body = discourse_email_trimmer(body)
+      raise EmptyEmailError if body.strip.blank?
+      body = EmailReplyParser.parse_reply(body)
+      raise EmptyEmailError if body.strip.blank?
+      body.force_encoding(encoding).encode("UTF-8")
+    end
+
+    # Picks the text to use as the reply body.
+    #
+    # For a multipart message the plain-text part is returned when present;
+    # otherwise HTML (from the HTML part, or from a text/html message) is
+    # cleaned with EmailHtmlCleaner, and as a last resort the message body
+    # itself is used. Raises EmptyEmailError if MIME markers remain.
+    def select_body(message)
+      html, text =
+        if message.multipart?
+          [fix_charset(message.html_part), fix_charset(message.text_part)]
+        elsif message.content_type =~ /text\/html/
+          [fix_charset(message), nil]
+        else
+          [nil, nil]
+        end
+
+      # prefer plain text
+      return text if text
+
+      body = html ? EmailHtmlCleaner.new(html).output_html : fix_charset(message)
+
+      # Certain trigger phrases in the body mean that we didn't parse the
+      # message correctly
+      raise EmptyEmailError if body =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/
+
+      body
+    end
+
+    # Returns the body of a Mail::Message or Mail::Part as a String, converted
+    # to UTF-8 when the object declares a charset. Returns nil for a nil
+    # object or when anything raises while reading or converting the body.
+    def fix_charset(object)
+      return nil if object.nil?
+      return object.body.to_s unless object.charset
+
+      charset = object.charset.gsub(/utf8/i, "UTF-8")
+      object.body.decoded.force_encoding(charset).encode("UTF-8").to_s
+    rescue
+      nil
+    end
+
+    REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To', 'Cc', 'Bcc', 'Date'] # 'Reply To' is one label
+    REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })
+
+    def discourse_email_trimmer(body)
+      lines = body.scrub.lines.to_a
+      range_end = 0
+
+      lines.each_with_index do |l, idx|
+        break if l =~ /\A\s*\-{3,80}\s*\z/ ||
+                 # This one might be controversial but so many reply lines have years, times and end with a colon.
+                 # Let's try it and see how well it works.
+                 (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
+                 (l =~ /On \w+ \d+,? \d+,?.*wrote:/)
+
+        # Headers on subsequent lines
+        break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX }
+        # Headers on the same line
+        break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3
+
+        range_end = idx
+      end
+
+      lines[0..range_end].join.strip
+    end
   end
 end
diff --git a/lib/gitlab/reply_by_email.rb b/lib/gitlab/reply_by_email.rb
new file mode 100644
index 0000000000..b6157de361
--- /dev/null
+++ b/lib/gitlab/reply_by_email.rb
@@ -0,0 +1,47 @@
+module Gitlab
+  module ReplyByEmail
+    class << self
+      def enabled?
+        config.enabled &&
+          config.address &&
+          config.address.include?("%{reply_key}")
+      end
+
+      def reply_key
+        return nil unless enabled?
+
+        SecureRandom.hex(16)
+      end
+
+      def reply_address(reply_key)
+        config.address.gsub('%{reply_key}', reply_key)
+      end
+
+      # Returns the reply key embedded in address: the first capture group of
+      # address_regex. Returns nil when address_regex is nil or when address
+      # does not match it.
+      def reply_key_from_address(address)
+        regex = address_regex
+        match = regex && address.match(regex)
+        match && match[1]
+      end
+
+      private
+
+      def config
+        Gitlab.config.reply_by_email
+      end
+
+      # Builds once, and then reuses, a frozen regex for the configured reply
+      # address in which the %{reply_key} placeholder becomes a capture group.
+      # Returns nil, without memoizing, while no address is configured.
+      def address_regex
+        return @address_regex if @address_regex
+        wildcard = config.address
+        return nil unless wildcard
+        escaped = Regexp.escape(wildcard).gsub(Regexp.escape('%{reply_key}'), "(.+)")
+        @address_regex = Regexp.new(escaped).freeze
+      end
+    end
+  end
+end
-- 
2.30.9