Submitted by: דניאל
import csv

import requests
import xlsxwriter
from lxml import html

# Author: Pablo Rotem
# Developer URL: https://pablo-guides.com


def scrape_site(url_list, xpaths, output_file, output_type):
    output_data = []
    fieldnames = [f'field_{index}' for index in range(len(xpaths))]
    for url in url_list:
        try:
            # Amazon tends to reject bare requests; send a browser-like
            # User-Agent and fail fast on HTTP errors.
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
            response.raise_for_status()
            tree = html.fromstring(response.content)
            extracted_data = {}
            for index, xpath in enumerate(xpaths):
                # xpath() returns a list of matches; keep the first one.
                # For an element node, prefer its src attribute (image
                # URL) over its text content.
                matches = tree.xpath(xpath)
                if not matches:
                    value = ''
                elif isinstance(matches[0], str):
                    value = matches[0]
                else:
                    value = matches[0].get('src') or matches[0].text_content()
                extracted_data[f'field_{index}'] = value.strip()
            output_data.append(extracted_data)
        except Exception as e:
            print(f'Error scraping {url}: {e}')

    if output_type == 'csv':
        with open(output_file, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(output_data)
    elif output_type == 'xlsx':
        workbook = xlsxwriter.Workbook(output_file)
        worksheet = workbook.add_worksheet()
        for row_num, data in enumerate(output_data):
            worksheet.write_row(row_num, 0, list(data.values()))
        workbook.close()
    else:
        with open(output_file, 'w') as f:
            for row in output_data:
                f.write(', '.join(row.values()) + '\n')


def main():
    xpaths = [
        '//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[2]/div/div/span/div/div/div[2]/span/a/div/img',
        '//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[2]/div/div/span/div/div/div[3]/div[1]/h2/a/span',
        '//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[2]/div/div/span/div/div/div[3]/div[3]/div/div[1]/a/span/span[2]/span[2]',
    ]
    url_list = [
        'https://www.amazon.com/s?i=specialty-aps&bbn=16225007011&rh=n%3A16225007011%2Cn%3A172456&language=he&ref=nav_em__nav_desktop_sa_intl_computer_accessories_and_peripherals_0_2_6_2',
    ]
    output_file = 'amazon1'
    output_type = 'csv'
    scrape_site(url_list, xpaths, output_file, output_type)


if __name__ == '__main__':
    main()
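Note: the absolute XPaths above are brittle (they break whenever Amazon reshuffles its result-page layout) and only ever match the first result card. A minimal alternative sketch follows, querying each card relative to a stable anchor instead. scrape_cards is a hypothetical helper, and the data-component-type / a-offscreen selectors are assumptions about Amazon's current markup, not something guaranteed by the original script:

import requests
from lxml import html


def scrape_cards(url):
    # Sketch only: the attribute and class names below are assumptions
    # about Amazon's markup at the time of writing and may change.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    rows = []
    for card in tree.xpath('//div[@data-component-type="s-search-result"]'):
        rows.append({
            'title': ''.join(card.xpath('.//h2//span/text()')).strip(),
            'image': next(iter(card.xpath('.//img/@src')), ''),
            'price': next(iter(card.xpath('.//span[@class="a-offscreen"]/text()')), ''),
        })
    return rows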