Public
Snippet $137 authored by isaak yansane-sisk

yankem.py

Edited
yankem.py
import sys
import logging
import requests
import zlib
from requests.auth import HTTPBasicAuth
from HTMLParser import HTMLParser

#
# Simple example of pulling log data from an Apache frontend.
#

# Route every record (DEBUG and above) from the root logger into
# logyanker.log; ``log`` is the root logger itself.
logging.basicConfig(
    level=logging.DEBUG,
    filename='logyanker.log',
)
log = logging.getLogger()

class LinkParser(HTMLParser):
    """
    Collect the ``href`` targets of the anchor tags fed to the parser.

    After ``feed()`` has been called, ``links`` holds the href values of
    all ``<a>`` tags in document order.
    """
    def __init__(self):
        # Explicit base-class call (instead of super()) because HTMLParser
        # is an old-style class on Python 2.
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """
        Record the href value(s) of an anchor start tag.
        input:
            tag (str): name of the tag being opened
            attrs (list): (name, value) pairs for the tag's attributes
        """
        if tag != 'a':
            return
        # Compare the attribute *name* only: a membership test on the whole
        # (name, value) pair would also pick up attributes whose value
        # happens to be 'href'. A bare ``href`` without a value is reported
        # as None and is skipped, since it is not a usable link.
        self.links.extend(
            value for name, value in attrs
            if name == 'href' and value is not None
        )


def download_page(url, user, passwd):
    """
    Downloads website content using HTTP basic authentication.
    input:
        url (str): a valid url
        user (str): user name for HTTP basic auth
        passwd (str): password for HTTP basic auth
    output:
        content (str): the content of the web page at given url,
            gunzipped when the url path ends in '.gz'; an empty string
            when the server answers with a status code of 400 or above
    """
    # NOTE(review): verify=False disables TLS certificate checking while
    # basic-auth credentials are being sent. Kept for backward
    # compatibility; confirm whether it is still required.
    response = requests.get(url, auth=HTTPBasicAuth(user, passwd), verify=False)

    if response.status_code >= 400:
        # Report the failure instead of hiding it, and never hand the
        # empty error body to the decompressor.
        log.warning("GET %s failed with HTTP status %d",
                    url, response.status_code)
        return ''

    content = response.content

    # Look at the path only (no query string or fragment) so that a '.gz'
    # appearing elsewhere in the url does not trigger decompression.
    path = url.split('#', 1)[0].split('?', 1)[0]
    if path.endswith('.gz'):
        content = decompress(content)

    return content


def decompress(data):
    """
    Inflate gzip-compressed bytes and return the uncompressed payload.
    input:
        data (str): gzip'ed byte string
    output:
        (str): the bytes produced by decompressing ``data``
    """
    # Adding 16 to the window size tells zlib to expect a gzip header and
    # trailer rather than a raw zlib stream.
    gzip_wbits = zlib.MAX_WBITS + 16
    inflater = zlib.decompressobj(gzip_wbits)
    return inflater.decompress(data)

if __name__ == '__main__':
    # Placeholders: fill in the credentials and the frontend location.
    user = 'YOUR-USERNAME'
    passwd = 'YOUR-PASSWORD'
    url = 'APACHE-FRONTEND-URI'

    # Fetch the index page and collect every link it contains.
    content = download_page(url, user, passwd)
    parser = LinkParser()
    parser.feed(content)

    # Binary mode because the downloaded payloads are raw bytes; the
    # with-block guarantees the file is closed even if a download fails.
    with open('apache.log', 'wb') as apachefile:
        for link in parser.links:
            if link == '/':
                continue
            # Linked pages need the same credentials as the index page.
            data = download_page(url + '/' + link, user, passwd)
            # len() is the payload size; sys.getsizeof() would report the
            # in-memory size of the Python object instead.
            log.info("Sending %d bytes to ingestion.", len(data))
            # write data to file
            apachefile.write(data)