xgboost_checkpoint / features.py
th1enq's picture
Upload features.py with huggingface_hub
07693f5 verified
"""
from bs4 import BeautifulSoup
with open("mini_dataset/6.html") as f:
test = f.read()
soup = BeautifulSoup(test, "html.parser")
"""
# has_title
def has_title(soup):
if soup.title is None:
return 0
if len(soup.title.text) > 0:
return 1
else:
return 0
# has_input
def has_input(soup):
if len(soup.find_all("input")):
return 1
else:
return 0
# has_button
def has_button(soup):
if len(soup.find_all("button")) > 0:
return 1
else:
return 0
# has_image
def has_image(soup):
if len(soup.find_all("image")) == 0:
return 0
else:
return 1
# has_submit
def has_submit(soup):
for button in soup.find_all("input"):
if button.get("type") == "submit":
return 1
else:
pass
return 0
# has_link
def has_link(soup):
if len(soup.find_all("link")) > 0:
return 1
else:
return 0
# has_password
def has_password(soup):
for input in soup.find_all("input"):
if (input.get("type") or input.get("name") or input.get("id")) == "password":
return 1
else:
pass
return 0
# has_email_input
def has_email_input(soup):
for input in soup.find_all("input"):
if (input.get("type") or input.get("id") or input.get("name")) == "email":
return 1
else:
pass
return 0
# has_hidden_element
def has_hidden_element(soup):
for input in soup.find_all("input"):
if input.get("type") == "hidden":
return 1
else:
pass
return 0
# has_audio
def has_audio(soup):
if len(soup.find_all("audio")) > 0:
return 1
else:
return 0
# has_video
def has_video(soup):
if len(soup.find_all("video")) > 0:
return 1
else:
return 0
# number_of_inputs
def number_of_inputs(soup):
return len(soup.find_all("input"))
# number_of_buttons
def number_of_buttons(soup):
return len(soup.find_all("button"))
# number_of_images
def number_of_images(soup):
image_tags = len(soup.find_all("image"))
count = 0
for meta in soup.find_all("meta"):
if meta.get("type") or meta.get("name") == "image":
count += 1
return image_tags + count
# number_of_option
def number_of_option(soup):
return len(soup.find_all("option"))
# number_of_list
def number_of_list(soup):
return len(soup.find_all("li"))
# number_of_TH
def number_of_TH(soup):
return len(soup.find_all("th"))
# number_of_TR
def number_of_TR(soup):
return len(soup.find_all("tr"))
# number_of_href
def number_of_href(soup):
count = 0
for link in soup.find_all("link"):
if link.get("href"):
count += 1
return count
# number_of_paragraph
def number_of_paragraph(soup):
return len(soup.find_all("p"))
# number_of_script
def number_of_script(soup):
return len(soup.find_all("script"))
# length_of_title
def length_of_title(soup):
if soup.title == None:
return 0
return len(soup.title.text)
"""
print("has_title --> ", has_title(soup))
print("has_input --> ", has_input(soup))
print("has_button --> ", has_button(soup))
print("has_image --> ", has_image(soup))
print("has_submit --> ", has_submit(soup))
print("has_link --> ", has_link(soup))
print("has_password --> ", has_password(soup))
print("has_email_input --> ", has_email_input(soup))
print("has_hidden_element --> ", has_hidden_element(soup))
print("has_audio --> ", has_audio(soup))
print("has_video --> ", has_video(soup))
print("number_of_inputs --> ", number_of_inputs(soup))
print("number_of_buttons --> ", number_of_buttons(soup))
print("number_of_images --> ", number_of_images(soup))
print("number_of_option --> ", number_of_option(soup))
print("number_of_list --> ", number_of_list(soup))
print("number_of_TH --> ", number_of_TH(soup))
print("number_of_TR --> ", number_of_TR(soup))
print("number_of_href --> ", number_of_href(soup))
print("number_of_paragraph --> ", number_of_paragraph(soup))
print("number_of_script --> ", number_of_script(soup))
print("length_of_title --> ", length_of_title(soup))
"""
# has h1
def has_h1(soup):
if len(soup.find_all("h1")) > 0:
return 1
else:
return 0
# has h2
def has_h2(soup):
if len(soup.find_all("h2")) > 0:
return 1
else:
return 0
# has h3
def has_h3(soup):
if len(soup.find_all("h3")) > 0:
return 1
else:
return 0
# length of text
def length_of_text(soup):
return len(soup.get_text())
# number of clickable button
def number_of_clickable_button(soup):
count = 0
for button in soup.find_all("button"):
if button.get("type") == "button":
count += 1
return count
# number of a
def number_of_a(soup):
return len(soup.find_all("a"))
# number of img
def number_of_img(soup):
return len(soup.find_all("img"))
# number of div class
def number_of_div(soup):
return len(soup.find_all("div"))
# number of figures
def number_of_figure(soup):
return len(soup.find_all("figure"))
# has footer
def has_footer(soup):
if len(soup.find_all("footer")) > 0:
return 1
else:
return 0
# has form
def has_form(soup):
if len(soup.find_all("form")) > 0:
return 1
else:
return 0
# has textarea
def has_text_area(soup):
if len(soup.find_all("textarea")) > 0:
return 1
else:
return 0
# has iframe
def has_iframe(soup):
if len(soup.find_all("iframe")) > 0:
return 1
else:
return 0
# has text input
def has_text_input(soup):
for input in soup.find_all("input"):
if input.get("type") == "text":
return 1
return 0
# number of meta
def number_of_meta(soup):
return len(soup.find_all("meta"))
# has nav
def has_nav(soup):
if len(soup.find_all("nav")) > 0:
return 1
else:
return 0
# has object
def has_object(soup):
if len(soup.find_all("object")) > 0:
return 1
else:
return 0
# has picture
def has_picture(soup):
if len(soup.find_all("picture")) > 0:
return 1
else:
return 0
# number of sources
def number_of_sources(soup):
return len(soup.find_all("source"))
# number of span
def number_of_span(soup):
return len(soup.find_all("span"))
# number of table
def number_of_table(soup):
return len(soup.find_all("table"))