A simple spider backed by a key-value store and a distributed filesystem, built with Python, HBase, and Hadoop

Tested on:

Linux: CentOS 5.3
Python: 2.5
Hadoop: 0.20.1
HBase: 0.20.0

from unittest import TestCase, main
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol

from hbase import Hbase
from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, NotFound
class HbaseWriter:
    """Store fetched web pages in an HBase table through a Thrift client.

    Each page is written to the "contents:" column of a row whose key is
    the page URL with the labels of its host name reversed (see reverseUrl).
    """

    def __init__(self, netloc, port, table="webpages"):
        """Open the Thrift connection and make sure the table exists.

        netloc -- host name or address passed to TSocket
        port   -- port passed to TSocket
        table  -- name of the table to write to; created when it is not
                  among the names returned by getTableNames()
        """
        self.tableName = table

        self.transport = TTransport.TBufferedTransport(
            TSocket.TSocket(netloc, port))
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = Hbase.Client(self.protocol)
        self.transport.open()

        tables = self.client.getTableNames()
        if self.tableName not in tables:
            self.__createTable()

    def __del__(self):
        # __init__ may have raised before self.transport was assigned; in
        # that case there is nothing to close and we must not raise here.
        transport = getattr(self, "transport", None)
        if transport is not None:
            transport.close()

    def __createTable(self):
        """Create the table with a single "contents:" column family."""
        self.client.createTable(self.tableName,
                                [ColumnDescriptor(name="contents:", maxVersions=1, compression="BLOCK"),])

    def reverseUrl(self, url):
        """Return the row key for url: the URL with its host labels reversed.

        A leading "scheme://" (or bare "//") prefix is dropped, empty path
        segments are discarded, and the dot-separated labels of the first
        segment are reversed, e.g. "http://www.b.com/foo" -> "com.b.www/foo".

        Raises IndexError when url contains no non-empty segment.
        """
        # Only a "//" that is also the first "/" in the string separates the
        # scheme from the host; a later "//" belongs to the path and must not
        # cause the host to be thrown away.
        slashes = url.find("//")
        if slashes != -1 and slashes == url.find("/"):
            link = url[slashes + 2:]
        else:
            link = url

        # A list comprehension (rather than filter()) keeps the result
        # indexable on both Python 2 and Python 3.
        hops = [hop for hop in link.split("/") if hop]
        if not hops:
            raise IndexError("url has no host component: %r" % (url,))

        labels = hops[0].split(".")
        labels.reverse()
        hops[0] = ".".join(labels)
        return "/".join(hops)

    def write(self, url, content):
        """Store content in the "contents:" column of the row for url."""
        row = self.reverseUrl(url)
        mutations = [Mutation(column="contents:", value=content)]
        self.client.mutateRow(self.tableName, row, mutations)


class TestHbaseWriter(TestCase):
    """Tests for HbaseWriter.

    Every test builds a real HbaseWriter in setUp, so all of them need the
    Thrift server at the hard-coded address below to be reachable. The
    table named "test" is removed again in tearDown.
    """

    def setUp(self):
        self.writer = HbaseWriter("192.168.0.1", 9090, "test")

    def tearDown(self):
        # Disable the table first, then delete it.
        name = self.writer.tableName
        client = self.writer.client
        client.disableTable(name)
        client.deleteTable(name)

    def testReverseUrl(self):
        """reverseUrl drops the scheme and reverses the host labels."""
        self.assertEqual(self.writer.reverseUrl("http://www.a.com"), "com.a.www")
        self.assertEqual(self.writer.reverseUrl("http://www.a.com/"), "com.a.www")
        self.assertEqual(self.writer.reverseUrl("http://a.com"), "com.a")
        self.assertEqual(self.writer.reverseUrl("http://www.b.com/foo"), "com.b.www/foo")
        self.assertEqual(self.writer.reverseUrl("aaa.bbb.ccc.com.cn/foo1/foo2"), "cn.com.ccc.bbb.aaa/foo1/foo2")

    def testCreate(self):
        """The writer's table exists and has only the expected columns."""
        tableName = self.writer.tableName
        client = self.writer.client
        self.assertTrue(self.writer.tableName in client.getTableNames())
        columns = dict()
        columns["contents"] = ColumnDescriptor(name="contents", maxVersions=1, compression="BLOCK")
        cds = client.getColumnDescriptors(tableName)
        # Only the descriptors are inspected; the mapping's keys are unused.
        for column in cds.values():
            self.assertTrue(column.name in columns)

    def testWrite(self):
        """Every written page can be read back under its reversed-URL key."""
        tableName = self.writer.tableName
        client = self.writer.client
        data = {"http://www.a.com":"com.a.www",
                "http://www.a.com/bbb":"com.a.www/bbb",
                "http://www.foo.com/foo":"foo"}
        for url, content in data.items():
            self.writer.write(url, content)

        seen = set()
        scannerId = client.scannerOpen(tableName, "", ["contents:",])
        try:
            while True:
                try:
                    result = client.scannerGet(scannerId)
                except NotFound:
                    # NotFound is treated as the end of the scan.
                    break
                row = result.row
                contents = result.columns["contents:"].value
                # Reversing the row key again restores the original host.
                url = "http://" + self.writer.reverseUrl(row)
                self.assertTrue(url in data)
                self.assertEqual(data[url], contents)
                seen.add(url)
        finally:
            # Close the scanner even when one of the assertions above fails.
            client.scannerClose(scannerId)

        # Guard against a vacuous pass when the scan returns no rows.
        self.assertEqual(seen, set(data))


# Run the unittest runner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

8 Responses to “A simple key-value and distributed filesystem spider by Python, Hbase, Hadoop”

  1. Chris says:

    In today’s world, marketers reach inside the home and attempt to figure out not what’s good for your daughter, because that is not their business, but what deep desires they can manipulate, stimulate and ostensibly satisfy in order to produce cold, hard cash.

  2. ruth says:

    Nothing changes your opinion of a friend so surely as success – yours or his.

  3. Laurella says:

    Always bear in mind that your own resolution to succeed is more important than any one thing.

  4. ettore says:

    But the body is deeper than the soul and its secrets inscrutable.

  5. tristan says:

    If your success is not on your own terms, if it looks good to the world but does not feel good in your heart, it is not success at all.

  6. rey says:

    They always say time changes things, but you actually have to change them yourself.

  7. colcmlfu says:

    colcmlfu…

    colcmlfu…

  8. cbrwpvdk says:

    cbrwpvdk…

    cbrwpvdk…

Leave a Reply

You must be logged in to post a comment.