删除 HTML 标记:在不使用 /text().extract() 的情况下去掉全部 HTML 标签

本文介绍了一个使用Scrapy爬虫抓取HTML数据并尝试清除其中的HTML标签的问题。作者希望通过简单的手段去除不需要的HTML标记,如<td>等,并寻求一种适合初学者的方法。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

首先,我对这一切都很陌生,所以请做好心理准备:下面的代码是我从各种来源复制、粘贴拼凑而成的。

我希望能够删除抓取结果中返回的所有多余 HTML 代码。把数据存入 MySQL 没有任何问题,但我目前还做不到的是去掉大量的“<td>”和其他 HTML 标记。我最初只是使用 /text().extract(),但它会随机遇到按如下方式格式化的单元格:

TEXT Text Text Text Text

这些单元格的出现没有任何规律,我无法据此决定何时使用 /text()、何时不用。我想找一个初学者也能实现的最简单办法,把这些标记全部去掉。

from scrapy.spider import BaseSpider

from scrapy.selector import HtmlXPathSelector

from scrapy.contrib.loader import XPathItemLoader

from scrapy.contrib.loader.processor import Join, MapCompose

import html2text

from scraper.items import LivingSocialDeal

class CFBDVRB(BaseSpider):
    """Spider that yields one ``LivingSocialDeal`` per row of the data table.

    Each item field is filled from the text found inside one ``<td>`` cell of
    the row, so the stored values contain no HTML markup.
    """

    name = "cfbdvrb"
    allowed_domains = ["url"]
    start_urls = [
        "url",
    ]

    # One selector match per table row; every row becomes one item.
    deals_list_xpath = '//table[@class="tbl data-table"]/tbody/tr'

    # ``//text()`` selects every text node below the cell, however deeply it
    # is nested in inline tags. Selecting the bare ``td`` element (or only
    # ``/text()``) either keeps the markup or misses text wrapped in child
    # elements.
    item_fields = {
        'title': './/td[1]//text()',
        'link': './/td[2]//text()',
        'location': './/td[3]//text()',
        'original_price': './/td[4]//text()',
        'price': './/td[5]//text()',
    }

    def parse(self, response):
        """Yield one loaded item per table row found in ``response``."""
        selector = HtmlXPathSelector(response)

        for deal in selector.xpath(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # Strip each text fragment and discard the ones that were only
            # whitespace (MapCompose ignores values mapped to None), then
            # join what is left with single spaces.
            loader.default_input_processor = MapCompose(
                unicode.strip, lambda text: text or None)
            loader.default_output_processor = Join()

            # The html2text converter that used to be created here was never
            # applied to any value, so it had no effect and is dropped.
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()

我最后一次尝试是用 html2text(converter = html2text.HTML2Text())来去除这些标记;我不确定自己的实现是否正确,但它没有起作用。

提前感谢你的帮助。如果我遗漏了什么只需简单搜索就能找到的基础知识,还请见谅。

// 补状态结构 typedef struct { char **matches; // 所有匹配项 int count; // 匹配项总数 char *base_dir; // 当前基础目录 char *current_prefix; // 当前前缀 char *full_path; // 完整路径(用于上下文检测) int last_state; // 上一次使用的state值 int current_level; // 当前目录层级 int is_directory; // 标记当前是否在目录层级 } CompletionState; static CompletionState comp_state = { .matches = NULL, .count = 0, .base_dir = NULL, .current_prefix = NULL, .full_path = NULL, .last_state = -1, .current_level = 0, .is_directory = 0 }; // 重置补状态 void reset_completion_state() { if (comp_state.matches) { for (int i = 0; i < comp_state.count; i++) { XFREE(MTYPE_TMP, comp_state.matches[i]); } XFREE(MTYPE_TMP, comp_state.matches); comp_state.matches = NULL; } XFREE(MTYPE_TMP, comp_state.base_dir); XFREE(MTYPE_TMP, comp_state.current_prefix); XFREE(MTYPE_TMP, comp_state.full_path); comp_state.count = 0; comp_state.base_dir = NULL; comp_state.current_prefix = NULL; comp_state.full_path = NULL; comp_state.last_state = -1; comp_state.current_level = 0; comp_state.is_directory = 0; } // 路径提取函数 static const char *extract_path(const char *text) { // 跳过命令部分(直到第一个空格) const char *ptr = text; while (*ptr && *ptr != ' ') ptr++; // 如果没有空格,说明没有参数 if (*ptr == '\0') { return ""; } // 跳过所有连续空格 while (*ptr == ' ') ptr++; return ptr; } // 判断是否相同上下文 int is_same_completion_context(const char *text) { // 首次初始化 if (!comp_state.full_path) return 0; const char *path_text = extract_path(text); // 比较完整路径是否相同 return strcmp(comp_state.full_path, path_text) == 0; } // 路径解析函数 void parse_input_text(const char *text, char **base_dir, char **prefix, int *level) { *level = 0; const char *ptr = text; const char *last_char = text + strlen(text) - 1; // 计算目录层级 while (*ptr) { if (*ptr == '/') (*level)++; ptr++; } // 查找最后一个斜杠位置 const char *last_slash = strrchr(text, '/'); if (last_slash) { // 处理以斜杠结尾的情况 if (last_char == last_slash) { size_t base_len = last_slash - text + 1; *base_dir = (char *)XCALLOC(MTYPE_TMP, base_len + 1); strncpy(*base_dir, text, base_len); (*base_dir)[base_len] = 
'\0'; *prefix = XSTRDUP(MTYPE_TMP, ""); comp_state.is_directory = 1; // 标记为目录层级 } // 处理斜杠在中间的情况 else { size_t base_len = last_slash - text + 1; *base_dir = (char *)XCALLOC(MTYPE_TMP, base_len + 1); strncpy(*base_dir, text, base_len); (*base_dir)[base_len] = '\0'; *prefix = XSTRDUP(MTYPE_TMP, last_slash + 1); comp_state.is_directory = 0; // 标记为文件层级 } } else { *base_dir = XSTRDUP(MTYPE_TMP, ""); *prefix = XSTRDUP(MTYPE_TMP, text); comp_state.is_directory = 0; // 标记为文件层级 } } // 路径比较函数 int compare_paths(const void *a, const void *b) { const char *path1 = *(const char **)a; const char *path2 = *(const char **)b; return strcmp(path1, path2); } // 生成当前目录内容 char **generate_current_dir_paths(const char *base_dir, const char *prefix) { const char *scan_dir = (base_dir && *base_dir) ? base_dir : "."; printf("[generate_current_dir_paths] Scanning directory: '%s' with prefix '%s'\n", scan_dir, prefix ? prefix : "(none)"); DIR *dir = opendir(scan_dir); if (!dir) { return NULL; } int capacity = 32; int count = 0; char **matches = (char **)XCALLOC(MTYPE_TMP, capacity * sizeof(char *)); struct dirent *entry; while ((entry = readdir(dir)) != NULL) { char *name = entry->d_name; // 跳过特殊目录 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) { continue; } // 跳过隐藏文件(除非用户明确输入点) if (name[0] == '.' 
&& (!prefix || prefix[0] != '.')) { continue; } // 应用前缀过滤 if (prefix && *prefix && strncmp(name, prefix, strlen(prefix)) != 0) { continue; } // 检测目录类型 int is_dir = 0; if (entry->d_type == DT_DIR) { is_dir = 1; } else if (entry->d_type == DT_UNKNOWN) { char full_path[PATH_MAX]; snprintf(full_path, sizeof(full_path), "%s/%s", scan_dir, name); struct stat statbuf; if (stat(full_path, &statbuf) == 0 && S_ISDIR(statbuf.st_mode)) { is_dir = 1; } } // 创建斜杠的目录名 char *new_name; if (is_dir) { new_name = (char *)XCALLOC(MTYPE_TMP, strlen(name) + 2); sprintf(new_name, "%s/", name); } else { new_name = XSTRDUP(MTYPE_TMP, name); } // 添加到匹配列表 if (count >= capacity) { capacity *= 2; matches = (char **)XREALLOC(MTYPE_TMP, matches, capacity * sizeof(char *)); } matches[count++] = new_name; } closedir(dir); // 排序结果 if (count > 0) { qsort(matches, count, sizeof(char *), compare_paths); } // 添加NULL结束符 matches = (char **)XREALLOC(MTYPE_TMP, matches, (count + 1) * sizeof(char *)); matches[count] = NULL; printf("[generate_current_dir_paths] Found %d matches\n", count); return matches; } // 智能文件补函数 char *filename_completion_function(const char *text, int state) { const char *path_text = extract_path(text); printf("\n[COMP] Entering: text='%s', state=%d, last_state=%d, path_text = %s\n", text, state, comp_state.last_state, path_text); // 初始化新补 if (state == 0) { int same_context = is_same_completion_context(text); if (!same_context) { reset_completion_state(); char *current_base = NULL; char *current_prefix = NULL; int current_level = 0; parse_input_text(path_text, &current_base, &current_prefix, &current_level); printf(" Parsed: base='%s', prefix='%s', level=%d\n", current_base, current_prefix, current_level); // 保存解析结果 comp_state.base_dir = current_base; comp_state.current_prefix = current_prefix; comp_state.current_level = current_level; comp_state.full_path = XSTRDUP(MTYPE_TMP, path_text); // 生成匹配项 comp_state.matches = generate_current_dir_paths(current_base, current_prefix); // 计算匹配项数量 
if (comp_state.matches) { comp_state.count = 0; while (comp_state.matches[comp_state.count]) { comp_state.count++; } } comp_state.last_state = -1; } } // 无匹配项时返回 if (!comp_state.matches || comp_state.count == 0) { printf("[COMP] No matches found\n"); return NULL; } // 循环获取下一个匹配项 int next_index = (comp_state.last_state + 1) % comp_state.count; comp_state.last_state = next_index; char *match = comp_state.matches[next_index]; printf(" Next index: %d/%d, match='%s'\n", next_index, comp_state.count - 1, match); // 构建完整路径 char *full_path = NULL; if (comp_state.base_dir && strlen(comp_state.base_dir) > 0) { int base_len = strlen(comp_state.base_dir); int needs_slash = (comp_state.base_dir[base_len - 1] != '/'); int len = base_len + strlen(match) + (needs_slash ? 1 : 0) + 1; full_path = (char *)XCALLOC(MTYPE_TMP, len); if (needs_slash) { snprintf(full_path, len, "%s/%s", comp_state.base_dir, match); } else { snprintf(full_path, len, "%s%s", comp_state.base_dir, match); } } else { full_path = XSTRDUP(MTYPE_TMP, match); } printf("[COMP] Returning: '%s'\n", full_path); return full_path; } // 补匹配函数(返回单个匹配项) char **filename_completion_matches(const char *text, CPFunction* genfunc) { printf("\n[MATCHES] Called with text='%s'\n", text); // 获取下一个匹配项 char *match = (*genfunc)(text, 0); if (!match) { printf("[MATCHES] No matches found\n"); return NULL; } // 创建只包含一个匹配项的数组 char **matches = (char **)XCALLOC(MTYPE_TMP, 2 * sizeof(char *)); matches[0] = match; matches[1] = NULL; printf("[MATCHES] Returning single match: '%s'\n", match); return matches; } 有问题: <dahua>dir Directory of flash: 0 -rw- 2423 Jan 01 1970 00:01:51 1.txt 1 drw- - Jan 01 1970 00:04:56 cfg 2 drw- - Jan 01 1970 00:00:54 etc 3 drw- - Jan 01 1970 00:01:26 home 4 drw- - Jan 01 1970 00:15:14 installers 5 drw- - Jan 01 1970 00:04:55 logfile 436148 KB total (340600 KB free) <dahua>dir e【cmlsh_completion】text = e [MATCHES] Called with text='e' [COMP] Entering: text='e', state=0, last_state=-1, path_text = Parsed: base='', 
prefix='', level=0 [generate_current_dir_paths] Scanning directory: '.' with prefix '' [generate_current_dir_paths] Found 7 matches Next index: 0/6, match='1.txt' [COMP] Returning: '1.txt' [MATCHES] Returning single match: '1.txt' 1.txt【cmlsh_completion】text = 1.txt [MATCHES] Called with text='1.txt' [COMP] Entering: text='1.txt', state=0, last_state=0, path_text = Next index: 1/6, match='cfg/' [COMP] Returning: 'cfg/' [MATCHES] Returning single match: 'cfg/' cfg/【cmlsh_completion】text = cfg/ [MATCHES] Called with text='cfg/' [COMP] Entering: text='cfg/', state=0, last_state=1, path_text = Next index: 2/6, match='etc/' [COMP] Returning: 'etc/' [MATCHES] Returning single match: 'etc/' etc/【cmlsh_completion】text = etc/ [MATCHES] Called with text='etc/' [COMP] Entering: text='etc/', state=0, last_state=2, path_text = Next index: 3/6, match='home/' [COMP] Returning: 'home/' [MATCHES] Returning single match: 'home/' home/【cmlsh_completion】text = home/ 我输入dir e,tab键后完没有补路径 好像是parse_input_text有问题 我的要求是我输入dir e后,连续tab键,效果如下: 1、 第一次tab:dir etc/ 第二次tab:dir etc/CML_DB.db 第三次tab:dir etc/nos.conf 第四次tab:dir etc/nos.conf.bak 第五次tab:dir etc/ssh/ 第六次tab:dir etc/ 第七次tab:dir etc/CML_DB.db 2、 要显示dir etc/ssh/下的文件,除非是我输入的是dir etc/s,然后连续tab键补才是: 第一次tab:dir etc/ssh/ 第二次tab:dir etc/ssh/ssh_host_rsa_key.pub 第三次tab:dir etc/ssh/ssh_host_ecdsa_key.pub 第四次tab:dir etc/ssh/ssh_host_rsa_key 第五次tab:dir etc/ssh/ssh_host_ed25519_key.pub 第六次tab:dir etc/ssh/ssh_host_ed25519_key 第七次tab:dir etc/ssh/ssh_host_ecdsa_key 第八次tab:dir etc/ssh/ 要这样的循环显示 就是每次tab只会显示一行
最新发布
08-01
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值