# nopCommerce monitor: duplicated attributes
import requests
import re
import csv
# Base address that every crawled link path is appended to.
root = "websites"

# Alert log: extractproduct() writes one SKU per line here whenever a
# product lists the same attribute name more than once.
a = open("./alertau.txt", "wb")

# Attribute name -> column index in the exported CSV row.
columns = {
    "SKU": 0,
    "Type": 1,
    "Colour": 2,
    "Media": 3,
    "Pore size": 4,
    "Length": 5,
    "i.d.": 6,
    "Volume": 7,
    "Diameter": 8,
    "Pack Size": 9,
}

# Rows destined for the CSV; the first row is the header, i.e. the column
# names ordered by their index.
skus = [sorted(columns, key=columns.get)]
def process():
    """Crawl the site from its landing page and dump the results to CSV.

    Downloads the start page, visits every link that extract() finds on it,
    and finally writes all collected rows with write2file().
    """
    url = "websites"
    landing_page = requests.get(url).content
    for category in extract(landing_page):
        visit(category)
    # Rows are only persisted once the whole crawl has finished.
    write2file()
def write2file():
    """Write every row accumulated in ``skus`` to ``./au.csv``.

    The file is opened in a ``with`` block so it is flushed and closed as
    soon as the rows are written, instead of relying on interpreter exit.
    """
    # 'wb' is the mode the Python 2 csv module expects for output files.
    with open('./au.csv', 'wb') as csvfile:
        csv.writer(csvfile).writerows(skus)
def visit(c):
    """Visit every result page of the listing at path ``c``.

    Fetches the un-paged URL once to learn how many pages there are (via
    findmax), then hands each page number from 1 to that count to visitv().
    """
    first_page_url = root + c
    print(first_page_url)
    first_page = requests.get(first_page_url).content
    page_count = findmax(first_page)
    print(page_count)
    page_no = 1
    while page_no <= page_count:
        visitv(c, page_no)
        page_no += 1
def visitv(c, i):
    """Fetch page ``i`` of path ``c`` and process it.

    A product page (as judged by iscat) is passed to extractproduct(); any
    other page is treated as a listing whose links are each followed with
    visit().
    """
    page_url = root + c + ("?pagenumber=%d" % i)
    print(page_url)
    page = requests.get(page_url).content

    if not iscat(page):
        extractproduct(page)
        return

    found = extractlink(page)
    print(found)
    for target in found:
        visit(target)
def findmax(html):
    """Return the number of pages advertised by the pager in ``html``.

    Prefers the page number on the "Last" link; otherwise uses the final
    numbered pager link found; returns 1 when there is no pager link at all.
    """
    flags = re.I | re.M | re.DOTALL

    last_link = re.findall(r'pagenumber=(\d+)">Last</a>', html, flags)
    print(last_link)
    if last_link:
        return int(last_link[0])

    # No "Last" link: fall back to the final numbered link in the markup.
    numbered = re.findall(r'pagenumber=(\d+)">\d+', html, flags)
    return int(numbered[-1]) if numbered else 1
def iscat(html):
    """Return True unless ``html`` contains the product SKU label markup."""
    # Pages carrying the SKU label are product pages, not listings.
    return '<span class="label">SKU: </span>' not in html
def extractproduct(html):
    """Record one product page.

    Extracts the SKU and attribute pairs, logs the SKU to the alert file
    ``a`` when any attribute name occurs more than once, and then queues the
    product for CSV export.
    """
    sku = extractsku(html)
    attr = extractattr(html)
    print(sku)
    print(attr)

    # A repeated name makes the set of names smaller than the list of names.
    names = [name for name, _value in attr]
    if len(set(names)) != len(names):
        a.write("%s\n" % sku)
        a.flush()

    export(sku, attr)
def export(sku, attr):
    """Append one CSV row for ``sku`` to the module-level ``skus`` list.

    Each (name, value) pair in ``attr`` is placed in the column that
    ``columns`` assigns to that name; columns without a value stay empty.
    A name missing from ``columns`` raises KeyError.
    """
    row = [sku] + [""] * 9
    for name, value in attr:
        position = columns[name]
        row[position] = value
    skus.append(row)
def extractsku(html):
    """Return the text of the first SKU value span found in ``html``.

    Raises IndexError when the page contains no such span.
    """
    pattern = r'<span class="value" itemprop="sku" id="sku\-\d+">(.*?)</span>'
    matches = re.findall(pattern, html, re.I | re.M | re.DOTALL)
    return matches[0]
def extractattr(html):
    """Return the specification rows of ``html`` as [name, value] lists.

    Each row is a spec-name cell followed by a spec-value cell; both texts
    are normalised with clean(). Rows are returned in document order.
    """
    row_pattern = (
        r'<td class="a-left spec\-name">(.*?)</td>'
        r'\s+<td class="a-left spec-value">(.*?)</td>'
    )
    rows = re.findall(row_pattern, html, re.I | re.M | re.DOTALL)
    return [[clean(name), clean(value)] for name, value in rows]
def clean(a):
    """Return ``a`` with CRLF pairs removed and outer whitespace stripped."""
    without_crlf = a.replace("\r\n", "")
    return without_crlf.strip()
def extractlink(html):
    """Return the link paths found under title headings in ``html``.

    Links under ``<h2 class="title">`` come first, followed by links under
    ``<h2 class="product-title">``; each group keeps document order.
    """
    flags = re.I | re.M | re.DOTALL
    title_links = re.findall(r'<h2 class="title">\s+<a href="(/.*?)"', html, flags)
    product_links = re.findall(r'<h2 class="product\-title">\s+<a href="(/.*?)"', html, flags)
    return title_links + product_links
def extract(html):
    """Return the link paths of all ``<li class="inactive">`` items in ``html``."""
    item_pattern = r'<li class="inactive">\s+<a href="(/.*?)">.*?</a>\s+</li>'
    return list(re.findall(item_pattern, html, re.I | re.M | re.DOTALL))
# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    process()