【问题标题】:ValueError: feature_names mismatch: training data did not have the following fields: 0ValueError:特征名称不匹配:训练数据没有以下字段:0
【发布时间】:2022-01-14 15:51:17
【问题描述】:

我收到以下错误

ValueError: feature_names mismatch: ['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection', 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic' , 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards'] ['0']

输入数据中的预期 https_Domain、DNS_Record、Domain_Age、URL_Depth、Have_At、iFrame、Have_IP、TinyURL、重定向、Web_Traffic、Web_Forwards、Mouse_Over、前缀/后缀、Right_Click、URL_Length、Domain_End 训练数据没有以下字段:0

    self.label_2 = QtWidgets.QLabel(self.centralwidget)
    self.label_2.setGeometry(QtCore.QRect(320, 260, 191, 51))
    font = QtGui.QFont()
    font.setPointSize(16)
    self.label_2.setFont(font)
    self.label_2.setStyleSheet("color: rgb(255, 0, 0);")
    self.label_2.setObjectName("label_2")
    self.textEdit = QtWidgets.QTextEdit(self.centralwidget)
    self.textEdit.setGeometry(QtCore.QRect(250, 140, 281, 31))
    self.textEdit.setObjectName("textEdit")
    self.pushButton = QtWidgets.QPushButton(self.centralwidget)
    self.pushButton.setGeometry(QtCore.QRect(360, 210, 80, 25))
    self.pushButton.setObjectName("pushButton")
    self.pushButton.clicked.connect(self.UrlfeatureExtraction)
    self.label_3 = QtWidgets.QLabel(self.centralwidget)
    self.label_3.setGeometry(QtCore.QRect(120, 150, 91, 16))
    self.label_3.setObjectName("label_3")
    MainWindow.setCentralWidget(self.centralwidget)
    self.menubar = QtWidgets.QMenuBar(MainWindow)
    self.menubar.setGeometry(QtCore.QRect(0, 0, 800, 30))
    self.menubar.setObjectName("menubar")
    MainWindow.setMenuBar(self.menubar)
    self.statusbar = QtWidgets.QStatusBar(MainWindow)
    self.statusbar.setObjectName("statusbar")
    MainWindow.setStatusBar(self.statusbar)

    self.retranslateUi(MainWindow)
    QtCore.QMetaObject.connectSlotsByName(MainWindow)

def retranslateUi(self, MainWindow):
    _translate = QtCore.QCoreApplication.translate
    MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
    self.label.setText(_translate("MainWindow", "    PHISHING WEBSITE DETECTION"))
    self.pushButton.setText(_translate("MainWindow", "DETECT"))
    self.label_2.setText(_translate("MainWindow", "ANSWER IS:-"))
    self.label_3.setText(_translate("MainWindow", "ENTER URL"))



# 1.Domain of the URL (Domain)
def getDomain(self,url):
    domain = urlparse(url).netloc
    if re.match(r"^www.", domain):
        domain = domain.replace("www.", "")
    return domain

# 2.Checks for IP address in URL (Have_IP)
def havingIP(self,url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except:
        ip = 0
    return ip

# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(self,url):
    if "@" in url:
        at = 1
    else:
        at = 0
    return at

# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(self,url):
    if len(url) < 54:
        length = 0
    else:
        length = 1
    return length

# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(self,url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth + 1
    return depth

# 6.Checking for redirection '//' in the url (Redirection)
def redirection(self,url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1
        else:
            return 0
    else:
        return 0


# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(self,url):
    domain = urlparse(url).netloc
    if 'https' in domain:
        return 1
    else:
        return 0

# listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                    r"tr\.im|link\.zip\.net"


# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(self,url):
    match = re.search(self.shortening_services, url)
    if match:
        return 1
    else:
        return 0


# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(self,url):
    if '-' in urlparse(url).netloc:
        return 1  # phishing
    else:
        return 0  # legitimate


def get_ipython(self):
    pass
    self.get_ipython().system('pip install python-whois')


# 12.Web traffic (Web_Traffic)
def web_traffic(self,url):
    try:
        # Filling the whitespaces in the URL if any
        url = urllib.parse.quote(url)
        rank = \
            BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(),
                        "xml").find(
                "REACH")['RANK']
        rank = int(rank)
    except TypeError:
        return 1
    if rank < 100000:
        return 1
    else:
        return 0

# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(self,domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if (isinstance(creation_date, str) or isinstance(expiration_date, str)):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except:
            return 1
    if ((expiration_date is None) or (creation_date is None)):
        return 1
    elif ((type(expiration_date) is list) or (type(creation_date) is list)):
        return 1
    else:
        ageofdomain = abs((expiration_date - creation_date).days)
        if ((ageofdomain / 30) < 6):
            age = 1
        else:
            age = 0
    return age

# 14.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(self,domain_name):
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except:
            return 1
    if (expiration_date is None):
        return 1
    elif (type(expiration_date) is list):
        return 1
    else:
        today = datetime.now()
        end = abs((expiration_date - today).days)
        if ((end / 30) < 6):
            end = 0
        else:
            end = 1
    return end

# 15. IFrame Redirection (iFrame)
def iframe(self,response):
    if response == "":
        return 1
    else:
        if re.findall(r"[<iframe>|<frameBorder>]", response.text):
            return 0
        else:
            return 1

# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(self,response):
    if response == "":
        return 1
    else:
        if re.findall("<script>.+onmouseover.+</script>", response.text):
            return 1
        else:
            return 0

# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(self,response):
    if response == "":
        return 1
    else:
        if re.findall(r"event.button ?== ?2", response.text):
            return 0
        else:
            return 1


# 18.Checks the number of forwardings (Web_Forwards)
def forwarding(self,response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 2:
            return 0
        else:
            return 1

# Function to extract features
# There are 17 features extracted from the dataset
def featureExtractions(self,url):
    self.getDomain(url)
    features=[]
    features = [self.havingIP(url), self.haveAtSign(url),self.getLength(url),self.getDepth(url),self.redirection(url),
                self.httpDomain(url),self.tinyURL(url),self.prefixSuffix(url)]
    # Address bar based features (9)

    # Domain based features (4)
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except:
        dns = 1

    features.append(dns)
    features.append(self.web_traffic(url))
    features.append(1 if dns == 1 else self.domainAge(domain_name))
    features.append(1 if dns == 1 else self.domainEnd(domain_name))

    # HTML & Javascript based features (4)
    try:
        response = requests.get(url)
    except:
        response = ""
    features.append(self.iframe(response))
    features.append(self.mouseOver(response))
    features.append(self.rightClick(response))
    features.append(self.forwarding(response))
    # label=0
    # features.append(label)

    book = xlsxwriter.Workbook('Extracted_data.xlsx')     
    sheet1 = book.add_worksheet()
        
    feature_names = ['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                'https_Domain', 'TinyURL','Prefix/Suffix', 'DNS_Record','Web_Traffic', 'Domain_Age', 'Domain_End',
                'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards']
        
    row = 0    
    column = 0 

    for item in feature_names :     
            # write operation perform     
        sheet1.write(row, column, item)      
        # incrementing the value of row by one with each iterations.     
        column += 1

    row = 1    
    column = 0

    for item in features :     
            # write operation perform     
        sheet1.write(row, column, item)      
            # incrementing the value of row by one with each iterations.     
        column += 1

    book.close()
    return features 



def UrlfeatureExtraction(self):
    url = self.textEdit.toPlainText()
    data=[]
    label=0
    data=self.featureExtractions(url)
    # data.pop()
    XGmodel = pickle.load(open('XGBoostClassifier.pkl', 'rb'))
    cols_when_model_builds = XGmodel.get_booster().feature_names
    print(data)
    dfa=pd.DataFrame(data)
    y_pred = XGmodel.predict(dfa)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    if predictions == 0:    
        value = "Legitimate"
        self.label.setText(value)
    else:
        value = "Phishing"
        self.label.setText(value);  
    

如果 name == "ma​​in": 导入系统 应用程序 = QtWidgets.QApplication(sys.argv) MainWindow = QtWidgets.QMainWindow() ui = Ui_MainWindow() ui.setupUi(主窗口) MainWindow.show() sys.exit(app.exec_())

我的 Xtrain 和 Xtest 包含 16 个特征,我已将其传递给模型的输入数据。我什至匹配了它的序列和没有特征。 我无法理解这个错误是什么...... 请帮我解决这个问题

【问题讨论】:

  • 请阅读ML标签的description
  • 请确保您是正确的formatting code。另请注意,您不应该编辑 pyuic 文件;在有关using Designer 的官方指南中阅读如何正确使用这些文件。
  • 您的错误消息何时结束而代码开始?请整理您的代码和错误信息
  • 请修剪您的代码,以便更容易找到您的问题。请按照以下指南创建minimal reproducible example

标签: python dataframe machine-learning pyqt5 xgboost


【解决方案1】:

您有 16 个特征名称和 17 个特征。

【讨论】:

  • 我的错请忽略评论中的 17,如果你数数你会发现它是 16...之前有 17 个,因为你可以在该评论中读到,但它只期望 16 个功能,所以我有删除它,加上最后一个是自变量,所以我没有接受它
猜你喜欢
  • 2019-03-18
  • 1970-01-01
  • 2016-05-06
  • 2018-08-25
  • 2022-01-19
  • 2023-02-13
  • 2019-08-03
  • 2022-11-27
  • 2020-03-15
相关资源
最近更新 更多