【问题标题】:Scraping data with jsoup使用 jsoup 抓取数据
【发布时间】:2020-02-15 19:29:41
【问题描述】:

我正在使用 jsoup 库抓取一些数据。数据以 HTML 表格(table 元素)的形式组织。我想在 textView 或 listView 中显示相关数据;作为第一步,先显示在 textView 中即可。但是当我尝试显示来自多个表的信息时,textView 只显示了第一个表。我实在想不明白问题出在哪里,希望你能帮我指出我做错了什么。

这里是代码


    // Output view for the scraped text; looked up and made scrollable in onCreate,
    // filled in by the background task's onPostExecute.
    TextView textView;
    // Button that starts the page download; its click listener is attached in onCreate.
    // Note: the nested AsyncTask class below uses the same identifier.
    Button dohvatiStranicu;

    /**
     * Inflates the main layout, looks up the output view and the fetch button,
     * and arranges for every button click to start a new download task.
     *
     * @param savedInstanceState state bundle passed straight to the superclass
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Output view: a scrolling movement method is installed on it.
        textView = (TextView)findViewById(R.id.textView);
        textView.setMovementMethod(new ScrollingMovementMethod());

        // Each click creates and executes a fresh task instance.
        final View.OnClickListener startDownload = new View.OnClickListener() {
            @Override
            public void onClick(View v) {
                new dohvatiStranicu().execute();
            }
        };

        dohvatiStranicu = (Button)findViewById(R.id.getPageButton);
        dohvatiStranicu.setOnClickListener(startDownload);
    }

    /**
     * Background task that downloads the exam-schedule page, formats every
     * {@code table} element with {@code parsirajTablicu} and finally shows the
     * collected text in {@code textView}.
     */
    public class dohvatiStranicu extends AsyncTask<Void,Void,Void>{

        // Accumulates the formatted text; created in onPreExecute, read in onPostExecute.
        StringBuilder stringBuilder;

        /** Starts every run with an empty buffer. */
        @Override
        protected void onPreExecute() {
            stringBuilder = new StringBuilder();
        }

        /**
         * Fetches the page and appends one formatted section per table.
         *
         * <p>Any exception (network or parsing) is only printed. It ends the
         * loop, and whatever was appended before it stays in the buffer.
         *
         * @return always {@code null}
         */
        @Override
        protected Void doInBackground(Void... voids) {
            try{
                final Document page = Jsoup.connect("https://inf.ffzg.unizg.hr/index.php/hr/studij/diplomski-studij/ispitni-rokovi?fbclid=IwAR0WuLXdooI_0wB8-vVbgZTs89jX-B0eNY0f4wmB9rScqojSqsA2oN-aQ6I").get();
                final Elements allTables = page.select("table");
                for (int i = 0; i < allTables.size(); i++) {
                    // The separator is appended before the table is parsed.
                    stringBuilder.append("\n\n\n").append(parsirajTablicu(allTables.get(i)));
                }
            }catch(Exception e){
                e.printStackTrace();
            }
            return null;
        }

        /** Displays whatever text the background step managed to collect. */
        @Override
        protected void onPostExecute(Void aVoid) {
            super.onPostExecute(aVoid);
            textView.setText(stringBuilder);
        }
    }

    /**
     * Formats one schedule table as plain text.
     *
     * <p>The output starts with five newlines, then the text of the table's
     * first {@code p} element and a blank line. After that come four groups,
     * each made of one {@code th} text followed by three {@code td} texts
     * indented by four spaces. The first {@code td} of the table is dropped
     * before indexing; group {@code c} then uses cells {@code c}, {@code c + 4}
     * and {@code c + 8}.
     *
     * <p>Nothing is guarded here: a table without a {@code p} element fails
     * with a {@link NullPointerException}, and a table with fewer than four
     * {@code th} or thirteen {@code td} elements fails with an
     * {@link IndexOutOfBoundsException}.
     *
     * @param table the table element to format
     * @return the formatted text block
     */
    private static String parsirajTablicu(Element table){
        final Element subjectName = table.selectFirst("p");
        final Elements termNames = table.select("th");
        final Elements termDates = table.select("td");
        termDates.remove(0);

        final StringBuilder out = new StringBuilder("\n\n\n\n\n");
        out.append(subjectName.text()).append("\n\n");

        for (int column = 0; column < 4; column++) {
            out.append(termNames.get(column).text()).append("\n");
            // Cells of one column sit four positions apart in the flat td list.
            for (int cell = column; cell < 12; cell += 4) {
                out.append("    ").append(termDates.get(cell).text()).append("\n");
            }
        }

        return out.toString();
    }
}

【问题讨论】:

    标签: java android jsoup


    【解决方案1】:

    第二个表没有“th”元素,因此 naziviRokova 为空,而 nazivPredmeta.text() 会抛出 NullPointerException

    您还必须小心datumiRokova.get(i),因为如果没有足够的元素,它将导致IndexOutOfBoundsException

    作为一般建议,尝试隔离“逻辑”和“android”集成代码,这将使您能够更有效地测试代码。

    完整示例:

    package org.example;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Stand-alone (non-Android) version of the exam-schedule scraper: downloads
     * the page, formats every {@code table} element as plain text and prints
     * the result to standard output.
     */
    public class SO_60092082 {

        /** Address of the page whose tables are formatted. */
        private static final String PAGE_URL = "https://inf.ffzg.unizg.hr/index.php/hr/studij/diplomski-studij/ispitni-rokovi?fbclid=IwAR0WuLXdooI_0wB8-vVbgZTs89jX-B0eNY0f4wmB9rScqojSqsA2oN-aQ6I";

        /** Number of {@code th} headings printed for each table. */
        private static final int COLUMN_COUNT = 4;

        /** Number of {@code td} values printed under each heading. */
        private static final int ROWS_PER_COLUMN = 3;

        /**
         * Number of leading {@code td} cells that are ignored. Presumably this
         * is the cell holding the subject title; confirm against the page markup.
         */
        private static final int SKIPPED_CELLS = 1;

        /**
         * Formats one schedule table as plain text.
         *
         * <p>The output starts with five newlines, followed by the text of the
         * first {@code p} element and a blank line when such an element exists.
         * Then, for each of the {@value #COLUMN_COUNT} columns, comes the
         * heading text on its own line followed by {@value #ROWS_PER_COLUMN}
         * cell texts, each indented by four spaces. Missing headings or cells
         * are rendered as empty strings, so this method does not throw for
         * tables that are smaller than expected, including tables with no
         * {@code td} at all.
         *
         * @param table the table element to format
         * @return the formatted text block
         */
        private static String parsirajTablicu(Element table){
            StringBuilder text = new StringBuilder("\n\n\n\n\n");

            Element nazivPredmeta = table.selectFirst("p");
            Elements naziviRokova = table.select("th");
            Elements datumiRokova = table.select("td");

            if (nazivPredmeta != null) {
                text.append(nazivPredmeta.text()).append("\n\n");
            }

            for (int column = 0; column < COLUMN_COUNT; column++) {
                text.append(safeGetText(naziviRokova, column)).append("\n");
                for (int row = 0; row < ROWS_PER_COLUMN; row++) {
                    // The leading cell is skipped with an offset rather than removed
                    // from the list: removing from an empty list would throw, and
                    // the offset leaves the selected elements untouched.
                    int cell = SKIPPED_CELLS + row * COLUMN_COUNT + column;
                    text.append("    ").append(safeGetText(datumiRokova, cell)).append("\n");
                }
            }

            return text.toString();
        }

        /**
         * Returns the text of the element at position {@code i}, or an empty
         * string when {@code i} is outside the list.
         *
         * @param elements list to read from
         * @param i zero-based position
         * @return the element text, or {@code ""} if there is no such element
         */
        private static String safeGetText(Elements elements, int i) {
            return (i >= 0 && i < elements.size()) ? elements.get(i).text() : "";
        }

        /**
         * Downloads the page and prints the formatted text of all its tables.
         *
         * @param args ignored
         */
        public static void main(String[] args) {

            try{
                StringBuilder stringBuilder = new StringBuilder();
                Document doc = Jsoup.connect(PAGE_URL).get();
                for (Element table : doc.select("table")) {
                    stringBuilder.append("\n\n\n").append(parsirajTablicu(table));
                }

                System.out.println(stringBuilder);
            }catch(Exception e){
                // Program boundary: report the failure (e.g. network error) and exit normally.
                e.printStackTrace();
            }
        }
    }
    

    【讨论】:

      猜你喜欢
      • 2014-05-13
      • 1970-01-01
      • 2021-07-24
      • 2013-01-26
      • 2019-04-14
      • 2013-06-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多