Hack 39. Meander Your Google Neighborhood

Meander the Web by building a "neighborhood" of sites around a URL.

It's called the World Wide Web, not the World Wide Straight Line. Sites link to other sites, building a web of sites. And what a tangled web we weave. Google Neighborhood by the Python-wise Mark Pilgrim (http://diveintomark.org) attempts to detangle some small portion of the Web by using the Google API to find sites related to a URL that you provide, scraping the links on the sites returned, and building a "neighborhood" of sites that link to both the original URL and each other. (A Google query for related:www.oreilly.com, for instance, turns up sites similar to O'Reilly's; the hack starts from results like these and follows their outbound links.)

If you'd like to give this hack a whirl without having to run it yourself, there's a live version available at http://diveintomark.org/archives/2002/06/04/who_are_the_people_in_your_neighborhood. The source code (included in the following section) for Google Neighborhood is available for download from http://diveintomark.org/projects/misc/neighbor.py.txt.
2.21.1. The Code
Google Neighborhood is written in the Python (http://www.python.org) programming language. You'll need Python installed on your system to run this hack.

"""Blogroll finder and aggregator"""

__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "Python"

try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except:
    pass
import urllib, urlparse, os, time, operator, sys, pickle, re, cgi
from sgmllib import SGMLParser
from threading import *

BUFFERSIZE = 1024
IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc')
INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp')
IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org',
                 'freshmeat.net', 'readroom.ipl.org', 'amazon.com', 'ringsurf.com')

def prettyURL(url):
    # Strip the protocol and any trailing slash for display purposes
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if path == '/':
        path = ''
    return urlparse.urlunparse(('', domain, path, '', '', '')).replace('//', '')

def simplifyURL(url):
    # Normalize URLs so the same site isn't counted twice
    url = url.replace('www.', '')
    url = url.replace('/coming.html', '/')
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if path == '':
        url = url + '/'
    return url

class MinimalURLOpener(urllib.FancyURLopener):
    def __init__(self, *args):
        apply(urllib.FancyURLopener.__init__, (self,) + args)
        self.addheaders = [('User-agent', '')]

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        # Don't prompt for a password on authentication failures
        pass

class BlogrollParser(SGMLParser):
    def __init__(self, url):
        SGMLParser.__init__(self)
        self.url = url
        self.reset()

    def reset(self):
        SGMLParser.reset(self)
        self.possible = []
        self.blogroll = []
        self.ina = 0

    def _goodlink(self, href):
        protocol, domain, path, params, query, fragment = urlparse.urlparse(href)
        if protocol.lower() <> 'http': return 0
        if self.url.find(domain) <> -1: return 0
        if domain in IGNOREDOMAINS: return 0
        if domain.find(':5335') <> -1: return 0
        if domain.find('.google') <> -1: return 0
        if fragment: return 0
        shortpath, ext = os.path.splitext(path)
        ext = ext.lower()
        if ext in INCLUDEEXTS: return 1
        if ext in IGNOREEXTS: return 0
        # more rules here?
        return 1

    def _confirmpossibles(self):
        # A run of 4 or more consecutive links looks like a blogroll
        if len(self.possible) >= 4:
            for url in self.possible:
                if url not in self.blogroll:
                    self.blogroll.append(url)
        self.possible = []

    def start_a(self, attrs):
        self.ina = 1
        hreflist = [e[1] for e in attrs if e[0] == 'href']
        if not hreflist: return
        href = simplifyURL(hreflist[0])
        if self._goodlink(href):
            self.possible.append(href)

    def end_a(self):
        self.ina = 0

    def handle_data(self, data):
        if self.ina: return
        if data.strip():
            self._confirmpossibles()

    def end_html(self):
        self._confirmpossibles()

def getRadioBlogroll(url):
    # Radio UserLand sites publish their blogroll as OPML at a known path
    try:
        usock = MinimalURLOpener().open('%s/gems/mySubscriptions.opml' % url)
        opmlSource = usock.read()
        usock.close()
    except:
        return []
    if opmlSource.find('<opml') == -1: return []
    radioBlogroll = []
    start = 0
    while 1:
        p = opmlSource.find('htmlUrl="', start)
        if p == -1: break
        refurl = opmlSource[p:p+100].split('"')[1]
        radioBlogroll.append(refurl)
        start = p + len(refurl) + 10
    return radioBlogroll

def getBlogroll(url):
    if url[:7] <> 'http://':
        url = 'http://' + url
    radioBlogroll = getRadioBlogroll(url)
    if radioBlogroll:
        return radioBlogroll
    parser = BlogrollParser(url)
    try:
        usock = MinimalURLOpener().open(url)
        htmlSource = usock.read()
        usock.close()
    except:
        return []
    parser.feed(htmlSource)
    return parser.blogroll

class BlogrollThread(Thread):
    def __init__(self, master, url):
        Thread.__init__(self)
        self.master = master
        self.url = url

    def run(self):
        self.master.callback(self.url, getBlogroll(self.url))

class BlogrollThreadMaster:
    def __init__(self, url, recurse):
        self.blogrollDict = {}
        self.done = 0
        if type(url) == type(''):
            blogroll = getBlogroll(url)
        else:
            blogroll = url
        self.run(blogroll, recurse)

    def callback(self, url, blogroll):
        if not self.done:
            self.blogrollDict[url] = blogroll

    def run(self, blogroll, recurse):
        # Fetch blogrolls five at a time, each in its own thread
        start = 0
        end = 5
        while 1:
            threads = []
            for url in blogroll[start:end]:
                if not self.blogrollDict.has_key(url):
                    t = BlogrollThread(self, url)
                    threads.append(t)
            for t in threads:
                t.start()
                time.sleep(0.000001)
            for t in threads:
                time.sleep(0.000001)
                t.join(10)
            start += 5
            end += 5
            if start > len(blogroll): break
        if recurse > 1:
            masterlist = reduce(operator.add, self.blogrollDict.values())
            newlist = [url for url in masterlist if not self.blogrollDict.has_key(url)]
            self.run(newlist, recurse - 1)
        else:
            self.done = 1

def sortBlogrollData(blogrollDict):
    # Count how many blogrolls each URL appears in
    sortD = {}
    for blogroll in blogrollDict.values():
        for url in blogroll:
            sortD[url] = sortD.setdefault(url, 0) + 1
    sortI = [(v, k) for k, v in sortD.items()]
    sortI.sort()
    sortI.reverse()
    return sortI

def trimdata(sortI, cutoff):
    return [(c, url) for c, url in sortI if c >= cutoff]

def getRelated(url):
    # Use the Google API to gather up to 30 related sites
    import google
    results = []
    start = 0
    for i in range(3):
        data = google.doGoogleSearch('related:%s' % url, start)
        results.extend([oneResult.URL for oneResult in data.results])
        start += 10
        if len(data.results) < 10: break
    return results

def getNeighborhood(baseURL):
    relatedList = getRelated(baseURL)
    blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict
    neighborhood = sortBlogrollData(blogrollDict)
    neighborhood = trimdata(neighborhood, 2)
    neighborhood = [(c, url, prettyURL(url)) for c, url in neighborhood]
    return neighborhood

def render_html(baseURL, data):
    output = []
    output.append("""
<table class="socialnetwork" summary="neighborhood for %s">
<caption>Neighborhood for %s</caption>
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">Links</th>
<th scope="col">Explore</th>
</tr>
</thead>
<tbody>""" % (cgi.escape(prettyURL(baseURL)), cgi.escape(prettyURL(baseURL))))
    for c, url, title in data:
        output.append("""<tr><td><a href="%s">%s</a></td>
<td>%s</td><td><a href="%s">explore</a></td></tr>
""" % (url, title, c, 'http://diveintomark.org/cgi-bin/neighborhood.cgi?url=%s' %
            cgi.escape(url)))
    output.append("""
</tbody>
</table>""")
    return "".join(output)

def render_rss(baseURL, data):
    title = prettyURL(baseURL)
    channeltitle = "%s neighborhood" % title
    localtime = time.strftime('%Y-%m-%dT%H:%M:%S-05:00', time.localtime())
    output = []
    output.append("""<?xml version="1.0"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:admin="http://webns.net/mvcb/">
<channel rdf:about="%(baseURL)s">
<title>%(channeltitle)s</title>
<link>%(baseURL)s</link>
<description>Sites in the virtual neighborhood of %(title)s</description>
<language>en-us</language>
<lastBuildDate>%(localtime)s</lastBuildDate>
<pubDate>%(localtime)s</pubDate>
<admin:generatorAgent rdf:resource="http://diveintomark.org/cgi-bin/neighborhood.cgi?v=1.1" />
<admin:errorReportsTo rdf:resource="mailto:f8dy@diveintomark.org"/>
<sy:updatePeriod>weekly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
<items>
<rdf:Seq>
""" % locals())
    for c, url, title in data:
        output.append("""<rdf:li rdf:resource="%s" />
""" % url)
    output.append("""</rdf:Seq>
</items>
</channel>
""")
    for c, url, title in data:
        output.append("""<item rdf:about="%(url)s">
<title>%(title)s</title>
<link>%(url)s</link>
<description>%(c)s links</description>
</item>
""" % locals())
    output.append("""</rdf:RDF>""")
    return "".join(output)

if __name__ == '__main__':
    url = sys.argv[1]
    print render_html(url, getNeighborhood(url))

You'll also need an HTML form to call the neighborhood.cgi script. Here's a simple one:

<form action="/cgi-bin/neighborhood.cgi" method="get">
URL: <input name="url" type="text" />
<br />
Output as: <input name="fl" type="radio" value="html" checked="true" /> HTML
<input name="fl" type="radio" value="rss" /> RSS
<br />
<input type="submit" value="Meander" />
</form>

Save the form as neighborhood.html, being sure to alter the action= to point at the location in which you installed the CGI script ["How to Run the Scripts" in the Preface].
2.21.2. Running the Hack
Point your browser at the location of the form you saved just a
moment ago. Provide it with the URL that you're
interested in using as the center, select HTML or RSS output, and hit
the Meander button. Figure 2-13 shows a representation of Rael's (raelity.org's, to be precise) Google Neighborhood. Clicking any of the links on
the left transports you to the URL shown. More interestingly, the
"explore" link shifts your point of
view, centering the neighborhood on the associated URL. You can thus
meander a neighborhood to your heart's content;
don't be surprised, especially in the blogging
world, if you keep coming across the same links. Speaking of links,
the number listed beneath the
"Links" heading represents the
number of links the associated site has to the currently focused
site.
Figure 2-13. raelity.org's Google Neighborhood

2.21.3. Hacking the Hack
If you want to hack this hack, concentrate your efforts on a small
block of code, specifying what file extensions you want to include
and exclude, as well as what domains you want to exclude when
calculating your neighborhoods:

IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc')
INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp')
IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org',
                 'freshmeat.net', 'readroom.ipl.org', 'amazon.com', 'ringsurf.com')
2.21.3.1 Noticing/ignoring file extensions
The way the hack is currently written, the neighborhood is built
around pretty standard files. However, you could create a
neighborhood of sites served by PHP (http://www.php.net), including only URLs with
a PHP (.php) extension. Or perhaps your interest
lies in Word documents and PDF files. You'd alter
the code as follows:

IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.html', '.htm', '.shtml',
              '.php', '.asp', '.jsp')
INCLUDEEXTS = ('', '.pdf', '.doc')
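Similarly, for the PHP-only neighborhood mentioned a moment ago, a sketch along the same lines (the exact lists are up to you) might look like this:

# Include only PHP pages; every other common extension is ignored.
IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc',
              '', '.html', '.htm', '.shtml', '.asp', '.jsp')
INCLUDEEXTS = ('.php',)   # note the trailing comma: a one-element tuple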
2.21.3.2 Ignoring domains
Sometimes, when building a neighborhood, you might notice that the
same links are popping up again and again. They're
not really part of the neighborhood but tend to be places that the
web pages making up your neighborhood often link to. For example,
most Blogger-based weblogs include a link to http://www.blogger.com as a matter of course.

Exclude domains that hold no interest for you by adding them to the IGNOREDOMAINS list:

IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org',
                 'freshmeat.net', 'readroom.ipl.org', 'amazon.com', 'ringsurf.com',
                 'blogger.com')
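A quick way to confirm a new entry has taken effect is to poke at the parser's link filter directly. This is a hypothetical interactive session, assuming you've saved the listing as neighbor.py:

# Hypothetical sanity check: _goodlink() should now reject links into
# blogger.com. Assumes the listing above is saved as neighbor.py.
from neighbor import BlogrollParser

parser = BlogrollParser('http://example.com/')
print parser._goodlink('http://blogger.com/')       # prints 0: domain now ignored
print parser._goodlink('http://diveintomark.org/')  # prints 1: still a good link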