Bluecoat ProxySG Cache Retrieval Script in Python

So, I was actually looking at this script today and thought folks who use Bluecoat as proxies at their jobs (I get the impression that they are pretty popular) might be interested in checking it out. It’s kind of like a poor-man’s pcap solution for sites that use a robust Bluecoat proxy but don’t have pcap instrumentation everywhere.

If you give this script a URI, a list of Bluecoat proxies, and credentials for those proxies, it goes and grabs the URI from the proxy, writes it to disk, and records some metadata about it, such as when it was last modified. Sometimes you can use this to retrieve a malicious payload that is otherwise unavailable to you due to take-down by law enforcement (LE) or replay-filtering by the adversary.

Print usage with --help, and make sure you define the setup variables appropriately before you run it. I hope you find it useful.

#!/usr/bin/env python
# creds: I wrote most of this, only thing I used for inspiration was this HTML table parser article:
# though honestly, his parser is much more feature-rich, his code taught me how the HTMLParser class works
# email me at mishley at-sign gmail dot com for cake and/or questions

import sys
import os
import urllib
from HTMLParser import HTMLParser
import optparse
import re
import time

# setup variables -- edit these to match your environment before running

# default list of proxies to use if -p is not provided
default_proxies = ["", ""]
# web port to access bluecoat proxy web admin interface
bluecoat_web_port = "3443"
# username and password for the web admin interface above
bluecoat_web_user = "username"
bluecoat_web_pass = "password"
# proxy port to request that a proxy directly proxy a request,
# may also probably use 80
bluecoat_proxy_port = "3128"

# parse command line args
parser = optparse.OptionParser()
parser.add_option("-u", "--uri", type="string", action="store", dest="uri", help="URI to retrieve. Must be a file object, not a directory.")
parser.add_option("-p", "--proxyip", type="string", action="append", dest="proxyip", help="Proxy IP addresses to search (defaults to all Bluecoats), can be used multiple times for multiple IP addresses. (if used more than once, --all is assumed)")
parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="Write file object metadata to log file, <filename>.log.")
parser.add_option("-a", "--all", dest="all", action="store_true", default=False, help="Grab a copy of the file from every proxy on which it is found, not just the first in the list. These files may be identical, use md5sum to check.")
options, args = parser.parse_args()

# input validation
if len(sys.argv) == 1:
	# called with no arguments at all: show the full usage text and stop
	parser.print_help()
	sys.exit(1)
if options.proxyip and len(options.proxyip) > 1:
	# several explicit proxies imply --all (see the --proxyip help text)
	options.all = True
if not options.proxyip:
	options.proxyip = default_proxies
else:
	# only user-supplied addresses are checked; anything other than digits
	# and dots cannot be a dotted-quad IP address
	for i in options.proxyip:
		if re.search(r'[^0-9\.]', i):
			parser.error("Option --proxyip must use a valid IP address, exiting.")
if not options.uri:
	parser.error("Option --uri is required for use, exiting.")

class proxyopen(urllib.FancyURLopener):
	"""URL opener that supplies the configured Bluecoat admin credentials.

	prompt_user_passwd() returns the module-level bluecoat_web_user /
	bluecoat_web_pass values, and http_error_401() gives up after
	self.maxtries failed Basic-auth attempts.
	"""

	def prompt_user_passwd(self, host, realm):
		"""Return the configured (user, password) pair for any host and realm."""
		return bluecoat_web_user, bluecoat_web_pass

	def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
		"""Error 401 -- authentication required. This function supports Basic authentication only."""
		# count attempts so that wrong credentials cannot retry forever
		self.tries += 1
		if self.maxtries and self.tries >= self.maxtries:
			self.tries = 0
			return self.http_error_default(url, fp, 500, "HTTPS Basic Auth timed out after "+str(self.maxtries)+" attempts.", headers)
		# URLopener must be qualified: this file only does "import urllib"
		if not 'www-authenticate' in headers:
			urllib.URLopener.http_error_default(self, url, fp, errcode, errmsg, headers)
		stuff = headers['www-authenticate']
		match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
		if not match:
			urllib.URLopener.http_error_default(self, url, fp, errcode, errmsg, headers)
		scheme, realm = match.groups()
		if scheme.lower() != 'basic':
			urllib.URLopener.http_error_default(self, url, fp, errcode, errmsg, headers)
		# dispatch to retry_http_basic_auth / retry_https_basic_auth
		name = 'retry_' + self.type + '_basic_auth'
		if data is None:
			return getattr(self,name)(url, realm)
		else:
			# NOTE(review): resetting the counter before a retry with a request
			# body means the maxtries cap never triggers on this path -- confirm
			# this is intended
			self.tries = 0
			return getattr(self,name)(url, realm, data)

def checkURI(uri="", proxyip=""):
	"""Ask one proxy's web admin interface for its cache info page on *uri*.

	Returns the page body when it contains 'CE URL Information' and does not
	contain '0x00000007'. Otherwise returns a sentinel string:
	"NOCONN_0xDEADBEEF" when the request failed, "NOAUTH_0xDEADBEEF" when the
	page says 'Authentication required', "NOTFOUND_0xDEADBEEF" for anything
	else. Exits the program when *uri* is not an http:// URI.
	"""
	# partition splits on the first '//' only, so a path that itself contains
	# '//' no longer breaks the unpacking; a URI without '//' is rejected below
	protocol, separator, domainandpath = uri.partition('//')
	protocol = protocol.rstrip(':')
	if not separator or protocol != 'http':
		sys.exit("Cannot process non-http requests, exiting.")
	infourl = "https://" + proxyip + ":" + bluecoat_web_port + "/CE/Info/" + protocol + "/" + domainandpath
	try:
		page = proxyopen().open(infourl).read()
	except Exception:
		# best effort: any failure to fetch the page is reported as
		# "no connection", but Ctrl-C and sys.exit are no longer swallowed
		return "NOCONN_0xDEADBEEF"
	if page.find('Authentication required') > -1:
		return "NOAUTH_0xDEADBEEF"
	if page.find('0x00000007') == -1 and page.find('CE URL Information') > -1:
		return page
	return "NOTFOUND_0xDEADBEEF"

def fdURI(uri="", proxyip=""):
	"""Open *uri* through the proxy at *proxyip* and return the file-like object.

	The request goes to the proxy's bluecoat_proxy_port over plain http.
	"""
	proxy_url = 'http://'+proxyip+':'+bluecoat_proxy_port
	return urllib.urlopen(uri, proxies={'http': proxy_url})

class parseTable(HTMLParser):
	"""Collect the text of every table cell in an HTML document.

	After feed(), self.tabledata holds one string per chunk of text found
	inside a <td> that is inside a <tr> inside a <table>, in document order.
	"""

	def __init__(self):
		# the base initialiser sets up the parser's internal buffer; without
		# it feed() fails. Called explicitly rather than via super() because
		# HTMLParser is an old-style class in Python 2.
		HTMLParser.__init__(self)
		self.in_table = 0
		self.in_tr = 0
		self.in_td = 0
		self.tabledata = []

	def handle_starttag(self, tag, attrs):
		"""Record that a table, row or cell has been opened."""
		if tag == 'table': self.in_table = 1
		if tag == 'tr': self.in_tr = 1
		if tag == 'td': self.in_td = 1

	def handle_data(self, data):
		"""Keep text only while inside table > tr > td."""
		if self.in_td and self.in_tr and self.in_table:
			self.tabledata.append(data)

	def handle_endtag(self, tag):
		"""Record that a table, row or cell has been closed."""
		if tag == 'table': self.in_table = 0
		if tag == 'tr': self.in_tr = 0
		if tag == 'td': self.in_td = 0

if __name__ == "__main__":
	# output files are named after the last path component of the URI
	filename = options.uri.split('/')[-1]
	for proxy in options.proxyip:
		meta = checkURI(options.uri, proxy)
		if meta == "NOCONN_0xDEADBEEF":
			print("Unable to connect to proxy "+proxy+" via urllib to find URL '"+options.uri+"'.")
		elif meta == "NOTFOUND_0xDEADBEEF":
			print("Unable to locate URL '"+options.uri+"' in proxy "+proxy+".")
		elif meta == "NOAUTH_0xDEADBEEF":
			print("Unable to authenticate to proxy "+proxy+".")
		else:
			# the proxy reports the object: pull it through that proxy
			fd = fdURI(options.uri, proxy)
			try:
				outstring = fd.read()
			finally:
				fd.close()
			# we are going to re-grab meta data now that we've potentially
			# modified the last-cached timestamp
			meta = checkURI(options.uri, proxy)
			tableparser = parseTable()
			tableparser.feed(meta)
			parsed = tableparser.tabledata
			tableparser = None
			# NOTE(review): presumably cell 9 of the info table holds the
			# retrieval date and time as its 3rd and 4th words -- confirm
			# against a real CE/Info page
			lastretrieved = time.strftime("%Y%m%d_%H:%M:%S_UTC", time.strptime(' '.join(parsed[9].split()[2:4]), "%m/%d/%Y %H:%M:%S"))
			fullname = filename+"_"+proxy+"_"+lastretrieved
			# "with" guarantees the payload is flushed and the file closed
			with open(fullname, 'wb') as outfile:
				outfile.write(outstring)
			print("Downloaded file '"+fullname+"' successfully.")
			if options.log:
				with open(fullname+".log", 'wb') as logfile:
					# cells alternate label, value: write "label :: value" lines
					for position, cell in enumerate(parsed, 1):
						if position % 2 == 0: logfile.write(cell+"\n")
						else: logfile.write(cell+" :: ")
				print("Successfully wrote metadata to file '"+fullname+".log'.")
			# without --all the first proxy holding the object is enough
			if not options.all:
				break

One thought on “Bluecoat ProxySG Cache Retrieval Script in Python”

  1. Great blog here! Also your web site loads up very fast!
    What host are you using? Can I get your affiliate link to your host?

    I wish my web site loaded up as quickly as yours lol

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )


Connecting to %s