- import re
- import sys
- import urllib.request
- from utils import pypi_utils
- from utils import database_utils_pool
- from bs4 import BeautifulSoup
- import os
- import zipfile
- import tarfile
- import dependency_analyzer as a
- from utils.pypi_utils import PYPIUtils
- from utils import directory_utils
- from utils import business_utils
- import datetime
- import ssl
-
- # Skip HTTPS certificate verification so that downloads do not fail on
- # hosts with certificate problems (a pragmatic choice for a crawler).
- ssl._create_default_https_context = ssl._create_unverified_context
-
-
- # Both paths are filled in by the caller before crawling starts.
- download_path = ''
- uncompress_path = ''
- # Get the download URL from the pypi website.
- # Newly added.
- def get_file_download_url_from_pypi(date_html):
-     soup = BeautifulSoup(date_html, 'html.parser')
-     try:
-         download_table = soup.find('table', class_='table table--downloads')
-         if download_table is not None:
-             # Old pypi.org layout: one <tr> per file, the first row is the header.
-             rows = download_table.find_all('tr')[1:]
-         else:
-             # New pypi.org layout: one card per file inside the #files section.
-             rows = soup.find(id='files').find_all('div', class_='card file__card')
-         anchors = [row.find('a') for row in rows]
-         return pick_download_link([a_tag for a_tag in anchors if a_tag is not None])
-     except Exception as e:
-         print("get_file_download_url_from_pypi error :" + str(e))
-         return "", ""
-
-
- # Pick the preferred download link from the <a> tags of a files listing.
- # Preference order: .whl > .zip > .tar.gz > .egg. Href and file name are
- # kept together as a pair so the returned values always describe the same file.
- def pick_download_link(anchors):
-     found = {}
-     for a_tag in anchors:
-         name = a_tag.get_text().replace('\r', '').strip()
-         href = str(a_tag.get('href')).replace('\r', '').strip()
-         for ext in ('.whl', '.zip', '.tar.gz', '.egg'):
-             if name.endswith(ext) and ext not in found:
-                 found[ext] = (href, name)
-                 break
-     for ext in ('.whl', '.zip', '.tar.gz', '.egg'):
-         if ext in found:
-             return found[ext]
-     print("failed to get pypi_href")
-     return '', ''
-
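- # Sketch (illustrative, not called elsewhere in this module): fetch one
- # release page and pick its download link; get_page is the fetch helper
- # already used by parse_first_page below.
- def demo_pick_download_url(filename, f_version):
-     page = pypi_utils.get_page('https://pypi.org/project/' + filename + '/' + f_version + '/#files')
-     return get_file_download_url_from_pypi(page)
-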
- # Get the update date of a package.
- # Newly added.
- def get_file_update_time_from_pypi(date_html):
-     soup = BeautifulSoup(date_html, 'html.parser')
-     table = soup.find('table', class_='table table--downloads')
-     if table is not None:
-         # Old layout: take the <time> tag of the first listed file.
-         time_tag = table.find_all('tr')[1].find_all('time')[0]
-     else:
-         # New layout: take the <time> tag of the first file card.
-         cards = soup.find(id='files').find_all('div', class_='card file__card')
-         time_tag = cards[0].find('time')
-     update_time = time_tag.get('datetime')
-     # Normalize "2021-01-01T12:00:00+0000" into "2021-01-01 12:00:00".
-     update_time = update_time.replace('T', ' ').replace('+0000', '')
-     return update_time
-
-
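- # Sketch (illustrative, not used elsewhere in this module): the string
- # returned above, "YYYY-MM-DD HH:MM:SS", can be parsed into a datetime
- # object when release dates need to be compared.
- def parse_update_time(update_time):
-     return datetime.datetime.strptime(update_time, '%Y-%m-%d %H:%M:%S')
-
-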
- # Pick a suitable download URL for each version of a package, download the
- # file, and insert the file name into the database.
- def parse_first_page(fileid, filename, file_num):
-     utils = PYPIUtils()
-     f_list = utils.find_all_file_version(filename)
-
-     for f_version in f_list:
-         exist_in_pypi_info_version = business_utils.is_exist_in_pypi_info_version(filename, f_version)
-         exist_in_pypi_info_version_all = business_utils.is_exist_in_pypi_info_version_all(filename, f_version)
-
-         if exist_in_pypi_info_version is False or exist_in_pypi_info_version_all is False:
-             date_html = ''
-             date_html2 = ''
-             url = 'https://pypi.org/project/' + filename + '/' + f_version + '/#files'
-             try:
-                 date_html = pypi_utils.get_page(url)
-             except Exception as e:
-                 print("parse_first_page url error " + str(e))
-
-             url2 = "https://pypi.tuna.tsinghua.edu.cn/simple/" + filename + '/'
-             try:
-                 date_html2 = pypi_utils.get_page(url2)
-             except Exception as e:
-                 print("parse_first_page url2 error " + str(e))
-
-             href, version_name = get_file_download_url_from_pypi(date_html)
-             if href != "" and version_name != "":
-                 update_time = get_file_update_time_from_pypi(date_html)
-                 if exist_in_pypi_info_version is False:
-                     # Prefer a download URL on the Tsinghua mirror; fall back
-                     # to the pypi.org link when no match is found.
-                     pattern_second = re.findall(r'<a (.*?)/a>', date_html2)
-                     download_url = []
-                     for item in pattern_second:
-                         download_url = re.findall('href="../..(.*?)">' + str(version_name) + '<', item, re.S)
-                         if download_url:
-                             url = "https://pypi.tuna.tsinghua.edu.cn" + download_url[0]
-                             break
-                     if not download_url:
-                         url = href
-                         # print(str(version_name) + " no Tsinghua mirror download URL matched")
-
-                     try:
-                         download_file(url, version_name, file_num, filename, f_version)
-                         business_utils.insert_file_name_with_version(fileid, version_name, filename, f_version)
-                     except Exception as e:
-                         print(str(e) + "download error:", version_name)
-             else:
-                 continue
-
-             if exist_in_pypi_info_version_all is False:
-                 business_utils.insert_file_name_with_version_and_update(filename, f_version, update_time, fileid)
-             # print("----------------------------------------------------------\n")
-         # else:
-         #     print("This file version already exists: ", "filename:", filename, ", version:", f_version,
-         #           "\n----------------------------------------------------------\n")
-
-
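- # Alternative sketch (illustrative, not what parse_first_page uses): the
- # mirror index can also be parsed with BeautifulSoup instead of regexes,
- # which is less fragile against markup changes; the '../..' handling mirrors
- # the regex above.
- def find_mirror_url(index_html, version_name):
-     soup = BeautifulSoup(index_html, 'html.parser')
-     for a_tag in soup.find_all('a'):
-         if a_tag.get_text().strip() == version_name:
-             href = str(a_tag.get('href'))
-             return 'https://pypi.tuna.tsinghua.edu.cn' + href.replace('../..', '', 1)
-     return ''
-
-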
- # Download one release file and hand it over to uncompress().
- def download_file(url, version_name, file_num, database_file_name, database_file_version):
-     file_type = version_name.split('.')[-1]
-     download_file_name = database_file_name + '-' + database_file_version
-
-     if file_type == 'gz':
-         download_file_name += '.tar.gz'
-     else:
-         # .whl, .egg and .zip files are all zip archives, so they are stored
-         # with a .zip suffix.
-         download_file_name += '.zip'
-     filename = download_path + '/' + download_file_name
-     # print('start downloading : %s' % filename)
-     # print("remote file name:" + version_name)
-     # print("local file name:" + download_file_name)
-     urllib.request.urlretrieve(url, filename)
-     # print('download finished : %s' % filename)
-     uncompress(download_path, uncompress_path, download_file_name, file_num, database_file_name, database_file_version)
-     # print('uncompress finished : %s' % filename)
-
-
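- # Sketch (illustrative, not called by download_file above): urlretrieve has
- # no built-in retry, so a thin wrapper like this can make transient network
- # failures less fatal; the attempt count is an assumption.
- def download_with_retry(url, filename, attempts=3):
-     for attempt in range(attempts):
-         try:
-             urllib.request.urlretrieve(url, filename)
-             return True
-         except Exception as e:
-             print("download attempt %d failed: %s" % (attempt + 1, e))
-     return False
-
-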
- # Uncompress the downloaded archive and keep only the setup/requirements files.
- def uncompress(path, save_path, file_name, file_num, database_file_name, database_file_version):
-     save_path = save_path + "/uncompress" + str(file_num)
-     file_path = path + '/' + file_name
-     try:
-         uncompress_file_name = database_file_name + "-" + database_file_version
-         uncompress_file_path = save_path + '/' + uncompress_file_name
-         # Create the target directory if it does not exist yet.
-         if not os.path.exists(uncompress_file_path):
-             os.makedirs(uncompress_file_path)
-
-         if os.path.splitext(file_name)[1] == '.zip':
-             with zipfile.ZipFile(file_path, 'r') as file_zip:
-                 file_zip.extractall(uncompress_file_path)
-         elif os.path.splitext(file_name)[1] == '.gz':
-             with tarfile.open(file_path) as t:
-                 t.extractall(uncompress_file_path)
-
-         # Drop files that are useless for the analysis, then delete the archive.
-         directory_utils.delete_useless_file(uncompress_file_path)
-         os.remove(file_path)
-     except Exception as e:
-         # Clean up the archive even when extraction fails.
-         if os.path.exists(file_path):
-             os.remove(file_path)
-         print("uncompress failed:" + str(e), file_name)
-
-
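- # Sketch (illustrative, not used by uncompress above): tarfile.extractall
- # trusts the member paths inside the archive; for untrusted packages a guard
- # like this, passed as t.extractall(dest, members=safe_tar_members(t, dest)),
- # avoids path-traversal writes outside the target directory.
- def safe_tar_members(tar, dest):
-     dest_real = os.path.realpath(dest)
-     for member in tar.getmembers():
-         target = os.path.realpath(os.path.join(dest, member.name))
-         if target.startswith(dest_real + os.sep):
-             yield member
-
-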
- # Write every file name together with its versions into the database.
- def crawl_file_from_version():
-     # SQL query: all package names known so far.
-     sql = "SELECT * FROM pypi_info"
-     data = []
-     try:
-         data = database_utils_pool.fetchall(sql)  # fetch all rows
-     except Exception as e:
-         print("crawl_file_from_version() dberror" + str(e))
-
-     num = 0
-     file_num = 0
-     for i in data:
-         # Packages are processed in batches of 100: each batch is unpacked
-         # into its own uncompressN directory, and the previous batch is
-         # analyzed and cleaned up before the next one starts.
-         flag = num % 100
-         file_num = num // 100
-         fileid = i['file_id']
-         filename = i['file_name']
-         if flag == 0 and file_num != 0:
-             analysis_path = uncompress_path + r'/uncompress' + str(file_num - 1)
-             if os.path.exists(analysis_path):
-                 a.path = analysis_path
-                 a.analysis(analysis_path)
-                 try:
-                     directory_utils.delete_all_file(analysis_path)
-                     directory_utils.delete_all_dir(analysis_path)
-                 except Exception as e:
-                     print(str(e))
-         try:
-             print("file_id:" + str(fileid))
-             # executor.submit(parse_first_page, fileid, filename, file_num)
-             parse_first_page(fileid, filename, file_num)
-             sys.stdout.flush()
-         except Exception as e:
-             print(str(e))
-             sys.stdout.flush()
-         num = num + 1
-
-     # Analyze and clean up the last (possibly partial) batch.
-     analysis_path = uncompress_path + r'/uncompress' + str(file_num)
-     if os.path.exists(analysis_path):
-         a.path = analysis_path
-         a.analysis(analysis_path)
-         try:
-             directory_utils.delete_all_file(analysis_path)
-             directory_utils.delete_all_dir(analysis_path)
-         except Exception as e:
-             print(str(e))
-
-
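- # Sketch (illustrative): the commented-out executor.submit call above hints
- # at a concurrent variant of crawl_file_from_version; one possible shape,
- # with the pool size as an assumption:
- #
- #     from concurrent.futures import ThreadPoolExecutor
- #     with ThreadPoolExecutor(max_workers=8) as executor:
- #         executor.submit(parse_first_page, fileid, filename, file_num)
-
-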
- # Crawl the names of all packages listed on the PyPI mirror's simple index.
- def crawl_all_file_name():
-     url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
-     headers = {'User-Agent': 'Mozilla/5.0'}
-     request = urllib.request.Request(url, headers=headers)
-     html = urllib.request.urlopen(request).read()
-     data = html.decode('utf-8')
-     soup = BeautifulSoup(data, 'html.parser')
-
-     # Every <a> tag on the simple index is one package name.
-     for item in soup.find_all("a"):
-         file_name_db = item.string
-         if file_name_db is None:
-             continue
-         business_utils.insert_file_name(file_name_db)
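-
-
- # Sketch of a possible entry point; the real driver script is not part of
- # this module, and both paths below are placeholder assumptions.
- if __name__ == '__main__':
-     download_path = '/tmp/pypi_downloads'     # placeholder path
-     uncompress_path = '/tmp/pypi_uncompress'  # placeholder path
-     crawl_all_file_name()
-     crawl_file_from_version()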