hey guys...

got a weird, hopefully simple issue.

the following sample bit of script is stripped down, and simply gets the
"form" nodes from the specified site, "schedule.psu.edu".

the problem i run into is that the dom/xpath from libxml2dom works and i
get the dom object every time i run the app, but the xpath is
intermittent! in other words, i can run the script 10 times and it might
work 7 or 8 times; the other times, the xpath doesn't give the nodes back.

when it works, name_ in the app should be a list of nodes (for the 2 forms
in the page), and len_ should be 2.

is there anything you might suggest that i try in order to get a better
handle on exactly what might be going on here?
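
one way to get a better handle on it (a minimal standalone sketch, not part
of the original script) is to save the raw html from every run to a
timestamped file and log the form count, so the failing fetches can be
diffed against the good ones. the url and the 'Firefox' user-agent come from
the script below; the file naming is just an assumption for illustration.

#!/usr/bin/python
# probe.py - hypothetical helper: fetch the page, save the raw html, count the forms
import time
import urllib2
import libxml2dom

url = "http://schedule.psu.edu/"

# fetch with plain urllib2 (no mechanize needed just to probe the page)
req = urllib2.Request(url, headers={'User-Agent': 'Firefox'})
s = urllib2.urlopen(req).read()

# keep the raw html so a bad run can be diffed against a good one later
fname = "run-%d.html" % int(time.time())
open(fname, "wb").write(s)

d = libxml2dom.parseString(s, html=1)
forms = d.xpath("//form")
print "saved", fname, "bytes =", len(s), "forms =", len(forms)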

keep in mind, i'm not a python guy, just trying to get this to work
consistently... my suspicion is that the culprit might be memory related.

i'm running linux on an x86 dual core with 4G ram. the python is 2.5.1.

thoughts/comments/etc would be appreciated...

-thanks!!!


#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
# name
# url
# address (street/city/state)
# phone
#
######################################################################
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time

########################
#
# Parse schedule.psu.edu
########################
##cj = "p"
##COOKIEFILE = 'cookies.lwp'
#cookielib = 1


urlopen = urllib2.urlopen
#cj = urllib2.cookielib.LWPCookieJar()
##cj = cookielib.LWPCookieJar()
Request = urllib2.Request
br = Browser()
br2 = Browser()

##if cj != None:
##    print "sss"
###install the CookieJar for the default CookieProcessor
##    if os.path.isfile(COOKIEFILE):
##        cj.load(COOKIEFILE)
##        print "foo\n"
##    if cookielib:
##        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
##        urllib2.install_opener(opener)
##        print "foo2\n"

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values1 = {'name': 'Michael Foord',
           'location': 'Northampton',
           'language': 'Python'}
headers = {'User-Agent': user_agent}

url="http://schedule.psu.edu/"
#=======================================


if __name__ == "__main__":
    # main app

    txdata = None

    #----------------------------

    ##br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.addheaders = [('User-Agent', 'Firefox')]

    print "url =", url
    br.open(url)
    ##cj.save(COOKIEFILE) # resave cookies

    res = br.response() # this is a copy of response
    s = res.read()
    print "slen=", len(s)

    # s contains HTML not XML text
    d = libxml2dom.parseString(s, html=1)
    print "d", d

    name_ = []
    len_ = 0
    name_ = d.xpath("//form")
    #name_ = d.xpath("/html/body/form")
    print "name1", name_
    len_ = len(name_)
    print "len", len(name_)
    #print "sdlfs"
    sys.exit()
    # else:
    #     print "err in form_ID"


    print "here..."


  • Bruce at Aug 25, 2008 at 6:20 pm
    never mind...

    it was an issue with the targeted site... it's sending screwed-up html on
    the runs where i get the error...

    thanks though!
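
    if the server really is sending broken html some of the time, one
    workaround (a sketch under that assumption, not something from the
    thread) is to re-fetch and re-parse until the expected forms show up,
    giving up after a few tries. the EXPECTED_FORMS and MAX_TRIES values
    are made up for illustration.

    #!/usr/bin/python
    # retry_fetch.py - hypothetical workaround: retry when the parsed page looks broken
    import time
    import urllib2
    import libxml2dom

    url = "http://schedule.psu.edu/"
    EXPECTED_FORMS = 2   # the page normally carries two forms
    MAX_TRIES = 5

    forms = []
    for attempt in range(MAX_TRIES):
        s = urllib2.urlopen(urllib2.Request(url, headers={'User-Agent': 'Firefox'})).read()
        d = libxml2dom.parseString(s, html=1)
        forms = d.xpath("//form")
        if len(forms) == EXPECTED_FORMS:
            break
        # page came back mangled -- wait a moment and try again
        time.sleep(2)

    print "got", len(forms), "form(s) after", attempt + 1, "attempt(s)"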

