用 Python 爬取立创商城

嘉立创的 PCB、SMT 和立创商城配合起来使用,确实大大提高了硬件工程师打板的效率,但是立创 EDA 离 Cadence、AD、PADS 还有很大的差距(5 年?),所以打算把所有数据爬下来,导入数据库,方便这些 EDA 软件使用。

先用 Postman 测试一下 API

def parseItem(url):
    """Scrape one product detail page and return its data as a dict.

    Fetches ``url``, parses the HTML with BeautifulSoup, and (when a part
    number is found) downloads the part's SVG drawings from the lceda.cn API,
    writing them as ``svg<N>.svg`` files relative to the current directory.

    Args:
        url: Address of the product detail page to scrape.

    Returns:
        A dict with these keys:

        * ``pdfurl``  - datasheet URL taken from the ``lookAllPdf`` element,
          or ``''`` when none is found.
        * ``value``   - product name with spaces and newlines removed.
        * ``prices``  - for each price row, maps the text before ``:¥`` to
          the text after it.
        * ``desc``    - parameter table cells flattened to ``name:value,``
          pairs (a trailing comma is kept).
        * ``pn``      - part number (text starting with ``C``), or ``''``.
        * ``fp``      - package / footprint text, or ``''``.
        * ``svgPath`` - list of SVG file paths written to disk.
        * ``imgUrl``  - image URLs with the ``breviary`` path swapped for
          the ``source`` path.
        * ``url``     - the ``url`` argument, unchanged.
        * ``type``    - text of the second cell of the parameter table.

    Raises:
        AttributeError: If the product name, parameter table or brand block
            is missing from the page.
        IndexError: If the parameter table has fewer than two cells.
        requests.RequestException: On network failure or timeout.
    """
    # Seconds. Without a timeout, requests may block indefinitely on a
    # stalled connection, which would hang a long-running crawl.
    request_timeout = 30

    def first_capture(pattern, items):
        """Return the first capture of ``pattern`` in any item's text, else ''."""
        for item in items:
            found = re.findall(pattern, item.text.replace('\n', ''))
            if found:
                return found[0]
        return ''

    def getSvb(pn):
        """Download the SVG drawings for part ``pn`` and return their paths."""
        svg_paths = []
        resp = requests.get(
            'https://lceda.cn/api/products/%s/svgs' % pn,
            timeout=request_timeout,
        )
        payload = json.loads(resp.text)
        if payload['success']:
            for index, svg in enumerate(payload['result']):
                # NOTE(review): the file name does not include the part
                # number, so a later call overwrites files from an earlier
                # one in the same working directory.
                path = 'svg%d.svg' % index
                # Explicit UTF-8 so the output does not depend on the
                # platform's locale encoding; ``with`` closes the handle
                # even if the write fails.
                with open(path, mode='w', encoding='utf-8') as svg_file:
                    svg_file.write(svg['svg'])
                svg_paths.append(path)
        return svg_paths

    page = requests.get(url=url, timeout=request_timeout)
    soup = BeautifulSoup(page.text, 'lxml')

    # Datasheet link: extracted from the first argument of the
    # downloadFileNoRemark('...') call inside the lookAllPdf element.
    pdf_node = soup.find(id='lookAllPdf')
    pdf_matches = re.findall(
        r"downloadFileNoRemark\(\'(.*?)\'", str(pdf_node), re.MULTILINE
    )
    if pdf_matches:
        pdf_url = pdf_matches[0].replace("%3F", "?").replace("%3D", "=")
    else:
        pdf_url = ''

    # Product name.
    name = soup.find(class_='product-name').text.replace(' ', '').replace('\n', '')

    # Price rows: "<left>:¥<right>" once whitespace is stripped.
    prices = {}
    for row in soup.find_all(class_='sample-list-tr'):
        parts = row.text.replace(' ', '').replace('\n', '').split(':¥')
        if len(parts) < 2:
            # Row carries no price separator; skip it rather than abort
            # the whole page.
            continue
        prices[parts[0]] = parts[1]

    # Parameter table: cell 1 is the type, the remaining cells alternate
    # between parameter name and parameter value.
    cells = soup.find(class_='param-body').find_all('td')
    type_ = cells[1].text
    # Cells that contain an <input> element are not part of the description.
    texts = [cell.text for cell in cells[2:] if cell.find('input') is None]
    desc = ''.join(
        text + (':' if position % 2 == 0 else ',')
        for position, text in enumerate(texts)
    )

    # Part number ("商品编号") and package ("封装") from the brand block.
    brand_items = soup.find(class_='product-brand-con').find_all(class_='item')
    pn = first_capture(r"商品编号: *(C.*)", brand_items)
    fp = first_capture(r"封装: *(.*)", brand_items)

    # SVG drawings; skip the API call when no part number was found, since
    # the request URL would be malformed.
    svg_paths = getSvb(pn) if pn else []

    # Product images: rewrite thumbnail URLs to the source-image path.
    thumb_container = soup.find(class_='thum-cont')
    images = thumb_container.find_all('img') if thumb_container is not None else []
    img_urls = [
        img['src'].replace(
            'https://alimg.szlcsc.com/upload/public/product/breviary/',
            'https://alimg.szlcsc.com/upload/public/product/source/',
        )
        for img in images
    ]

    return {
        'pdfurl': pdf_url,
        'value': name,
        'prices': prices,
        'desc': desc,
        'pn': pn,
        'fp': fp,
        'svgPath': svg_paths,
        'imgUrl': img_urls,
        'url': url,
        'type': type_,
    }


工信部备案:渝ICP备16001374号-1