Categorías
Otros

Cuando se utiliza Python para rastrear un rastreador, siempre habrá errores y dejar de ejecutarse, y preguntar multiprocessing.pool.RemoteTraceback, voy a publicar todo el código fuente, por favor ayuda, gracias!

Código fuente:

import requests  # Crawl page
import logging  # Output information
import re  # Regular expression
import pymongo  # Storing data
from pyquery import PyQuery as pq  # Parse the webpage directly
from urllib.parse import urljoin  # URL stitching
import multiprocessing

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://static1.scrape.cuiqingcai.com'
TOTAL_PAGE = 10
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'  # Name database
MONGO_COLLECTION_NAME = 'MOVIES'  # Collection name

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client['movies']
collection = db['movies']


def scrape_page(url):
    """List crawling method"""
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """List page crawling method"""
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def parse_index(html):
    """Parsing List Page"""
    doc = pq(html)
    links = doc('.el-card .name')  # CSS selector
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info('get detail url %s', detail_url)
        yield detail_url


def scrape_detail(url):
    return scrape_page(url)


def parse_detail(html):
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a > h2').text()
    categories = [item.text() for item in doc('.categories button span').items()]
    published_at = doc('.info:contains (released)').text()
    published_at = re.search('(d{4}-d{2}-d{2})', published_at).group(1) 
        if published_at and re.search('d{4}-d{2}-d{2}', published_at) else None
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    collection.update_one({'name': data.get('name')}, {'$set': data}, upsert=True)


def main(page):
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving data to mongodb')
        save_data(data)
        logging.info('data saved successfully')


if __name__ == '__main__':
   
    pool = multiprocessing.Pool()
    pages = range(1, TOTAL_PAGE + 1)
    pool.map(main, pages)
    pool.close()
    pool.join()


Fue capaz de correr normalmente al principio, pero se detuvo después de un corto tiempo.
El mensaje de error es:

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:UsersAdministratorAppDataLocalProgramsPythonPython38-32libmultiprocessingpool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:UsersAdministratorAppDataLocalProgramsPythonPython38-32libmultiprocessingpool.py", line 48, in mapstar
    return list(map(*args))
  File "C:UsersAdministratorPycharmProjects20_Requests+Requery+MongoDB basic case actual combat3_Multi-process crawling.py", line 91, in main
    data = parse_detail(detail_html)
     File "C:UsersAdministratorPycharmProjects20_Requests+Requery+MongoDB basic case combat3_Multi-process crawling.py", line 56, in parse_detail
    doc = pq(html)
  File "C:UsersAdministratorAppDataLocalProgramsPythonPython38-32libsite-packagespyquery-1.4.1-py3.8.eggpyquerypyquery.py", line 267, in __init__
    raise TypeError(context)
TypeError: None
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:/Users/Administrator/PycharmProjects/20_Requests+Requery+MongoDB basic case combat/3_Multi-process crawling.py", line 102, in <module>
    pool.map(main, pages)
  File "C:UsersAdministratorAppDataLocalProgramsPythonPython38-32libmultiprocessingpool.py", line 364, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "C:UsersAdministratorAppDataLocalProgramsPythonPython38-32libmultiprocessingpool.py", line 768, in get
    raise self._value
TypeError: None

.

  Cómo jugar EV4 o EvPlayer?

Por Programación.Click

Más de 20 años programando en diferentes lenguajes de programación. Apasionado del code clean y el terminar lo que se empieza. ¿Programamos de verdad?

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *