# nopCommerce monitor: duplicated attributes

# nopCommerce monitor: duplicated attributes

import requests
import re
import csv
# Prefix that visit()/visitv() prepend to every relative path they fetch.
root = "websites"

# Alert log: extractproduct() writes one SKU per line here whenever a product
# lists the same attribute name more than once.
a = open("./alertau.txt","wb")

# Rows destined for the CSV export; the first row is the header.
skus = [["SKU","Type","Colour","Media","Pore size","Length","i.d.","Volume","Diameter","Pack Size"]]

# Attribute name -> position of its cell inside a CSV row (header order).
columns = dict((title, position) for position, title in enumerate(skus[0]))
def process():
    """Crawl the site from its landing page and export the results.

    Downloads the page at ``root``, obtains the top-level links from it with
    ``extract`` and walks each of them with ``visit``.  ``write2file`` is
    called from a ``finally`` clause so that rows already gathered in
    ``skus`` are still written when the crawl aborts part-way (for example
    on a network error); the exception then propagates unchanged.
    """
    # Use the shared base URL rather than repeating its literal here, so the
    # landing page and the per-category requests can never drift apart.
    html = requests.get(root).content
    try:
        for c in extract(html):
            visit(c)
    finally:
        write2file()
def write2file():
    """Write every row accumulated in ``skus`` to ``./au.csv``.

    The file is overwritten on each call.  It is opened through a ``with``
    block so the handle is flushed and closed deterministically instead of
    whenever the interpreter happens to collect it.
    """
    # Binary mode is kept from the original code: the Python 2 csv module
    # asks for files opened with the 'b' flag.
    with open('./au.csv', 'wb') as csvfile:
        csv.writer(csvfile).writerows(skus)
def visit(c):
    """Walk every numbered page of the relative path ``c``.

    Fetches ``root + c``, lets ``findmax`` read the highest page number out
    of the response, then calls ``visitv`` once per page number from 1 up to
    and including that value.  The URL and the page count are printed as
    progress output.
    """
    target = root + c
    print(target)
    first_page = requests.get(target).content
    last_page = findmax(first_page)
    print(last_page)
    page_number = 1
    while page_number <= last_page:
        visitv(c, page_number)
        page_number += 1
def visitv(c, i):
    """Fetch page ``i`` of the relative path ``c`` and process it.

    When ``iscat`` reports that the page is not a product page, every link
    returned by ``extractlink`` is followed recursively through ``visit``.
    Otherwise the page is handed to ``extractproduct``.
    """
    page_url = "".join([root, c, "?pagenumber=%d" % i])
    print(page_url)
    body = requests.get(page_url).content

    # Product page: record it and stop, there is nothing further to follow.
    if not iscat(body):
        extractproduct(body)
        return

    children = extractlink(body)
    print(children)
    for child in children:
        visit(child)
def findmax(html):
    """Return the highest page number advertised by the pager in ``html``.

    The target of a "Last" pager link wins when one is present.  Failing
    that, the final numbered pager link found in the markup is used.  If
    neither kind of link exists the listing is taken to have a single page
    and 1 is returned.  The matches for the "Last" link are printed as
    debugging output.
    """
    flags = re.I | re.M | re.DOTALL

    last_link = re.findall(r'pagenumber=(\d+)">Last</a>', html, flags)
    print(last_link)
    if last_link:
        return int(last_link[0])

    numbered = re.findall(r'pagenumber=(\d+)">\d+', html, flags)
    return int(numbered[-1]) if numbered else 1
def iscat(html):
    """Tell whether ``html`` is something other than a product page.

    A page counts as a product page when it contains the exact SKU label
    markup; any page without that marker yields True.
    """
    return '<span class="label">SKU: </span>' not in html
def extractproduct(html):
    """Record the product described by the page ``html``.

    The SKU and the attribute pairs are pulled out with ``extractsku`` and
    ``extractattr`` and printed.  If any attribute name occurs more than
    once, the SKU is written as a line to the alert file ``a`` and flushed
    immediately.  The product is then passed to ``export`` in every case.
    """
    sku = extractsku(html)
    attr = extractattr(html)
    print(sku)
    print(attr)

    # A set drops repeated names, so a smaller set means a duplicate exists.
    names = [name for name, _ in attr]
    if len(set(names)) < len(names):
        a.write("%s\n" % sku)
        a.flush()

    export(sku, attr)
def export(sku, attr):
    """Append one ten-cell row for a product to ``skus``.

    Cell 0 starts out as ``sku``; each ``(name, value)`` pair in ``attr`` is
    then stored in the cell that ``columns`` assigns to ``name``.  When a
    name repeats, the later value replaces the earlier one.

    Raises:
        KeyError: if an attribute name is not a key of ``columns``.
    """
    row = [sku] + [""] * 9
    for name, value in attr:
        row[columns[name]] = value
    skus.append(row)
def extractsku(html):
    """Return the text inside the first SKU ``<span>`` found in ``html``.

    Raises:
        IndexError: if the page contains no matching SKU span.
    """
    pattern = r'<span class="value" itemprop="sku" id="sku\-\d+">(.*?)</span>'
    matches = re.findall(pattern, html, re.I | re.M | re.DOTALL)
    return matches[0]
def extractattr(html):
    """Return the specification rows of ``html`` as ``[name, value]`` lists.

    Every pair of adjacent "spec-name" / "spec-value" table cells yields one
    entry, in document order, with both strings passed through ``clean``.
    """
    pattern = r'<td class="a-left spec\-name">(.*?)</td>\s+<td class="a-left spec-value">(.*?)</td>'
    rows = re.findall(pattern, html, re.I | re.M | re.DOTALL)
    return [[clean(name), clean(value)] for name, value in rows]
def clean(a):
    """Return ``a`` with CRLF pairs removed and outer whitespace stripped."""
    without_crlf = a.replace("\r\n", "")
    return without_crlf.strip()
def extractlink(html):
    """Collect the relative links that follow listing headings in ``html``.

    Links under ``<h2 class="title">`` headings come first, followed by the
    links under ``<h2 class="product-title">`` headings; each group keeps
    its document order.  Only hrefs starting with "/" are matched.
    """
    flags = re.I | re.M | re.DOTALL
    title_links = re.findall(r'<h2 class="title">\s+<a href="(/.*?)"', html, flags)
    product_title_links = re.findall(r'<h2 class="product\-title">\s+<a href="(/.*?)"', html, flags)
    return title_links + product_title_links
def extract(html):
    """Return the hrefs of the links inside ``<li class="inactive">`` items.

    Only hrefs that start with "/" are matched; results are in document
    order and returned as a new list.
    """
    pattern = r'<li class="inactive">\s+<a href="(/.*?)">.*?</a>\s+</li>'
    return list(re.findall(pattern, html, re.I | re.M | re.DOTALL))

# Start the crawl only when this file is executed as a script; importing it
# as a module does not call process().
if __name__ == "__main__":
    process()