@article{Harth2010:Visnav,
  author        = {Harth, Andreas},
  title         = {{VisiNav}: A system for visual search and navigation on web data},
  journal       = {Web Semantics: Science, Services and Agents on the World Wide Web},
  volume        = 8,
  number        = 4,
  pages         = {348--354},
  year          = 2010,
  note          = {Semantic Web Challenge 2009 User Interaction in Semantic Web research},
  doi           = {10.1016/j.websem.2010.08.001},
  url           = {http://www.sciencedirect.com/science/article/pii/S1570826810000600},
  issn          = {1570-8268},
  keywords      = {Web data},
  bdsk-url-1    = {http://www.sciencedirect.com/science/article/pii/S1570826810000600},
  bdsk-url-2    = {http://dx.doi.org/10.1016/j.websem.2010.08.001},
  added-at      = {2025-02-12T19:03:36.000+0100},
  timestamp     = {2025-02-12T19:03:36.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/27bf3635136dd16bd4c8c1fa8665b3c07/aksw},
  interhash     = {761bf01626aba910d7c5250d9830b54e},
  intrahash     = {7bf3635136dd16bd4c8c1fa8665b3c07},
}

% The file previously held two records with the key linkedspending for the same
% work (same DOI, same hashes). BibTeX rejects repeated keys, so only this one
% record is kept.
@article{linkedspending,
  author        = {H{\"o}ffner, Konrad and Martin, Michael and Lehmann, Jens},
  title         = {{LinkedSpending}: {OpenSpending} becomes {Linked Open Data}},
  journal       = {Semantic Web Journal},
  year          = 2015,
  doi           = {10.3233/SW-150172},
  url           = {http://www.semantic-web-journal.net/system/files/swj923.pdf},
  keywords      = {Data MOLE Open OpenSpending RDF SIMBA budget expenditure finance group_aksw hoeffner lehmann linkedspending martin public semantic transparency web},
  abstract      = {There is a high public demand to increase transparency in government spending. Open spending data has the power to reduce corruption by increasing accountability and strengthens democracy because voters can make better informed decisions. An informed and trusting public also strengthens the government itself because it is more likely to commit to large projects. OpenSpending.org is a an open platform that provides public finance data from governments around the world. In this article, we present its RDF conversion LinkedSpending which provides more than five million planned and carried out financial transactions in 627 datasets from all over the world from 2005 to 2035 as Linked Open Data. This data is represented in the RDF Data Cube vocabulary and is freely available and openly licensed.},
  bdsk-url-1    = {http://www.semantic-web-journal.net/system/files/swj923.pdf},
  added-at      = {2025-02-12T19:03:35.000+0100},
  timestamp     = {2025-02-12T19:03:35.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2f532f2aebeaef4e6b1fbd9e1f993a673/aksw},
  interhash     = {5af8fb87364343cc36e59e5461ac2b16},
  intrahash     = {f532f2aebeaef4e6b1fbd9e1f993a673},
}

@article{torkeyisultan2013adaptive,
  author        = {Sultan, Torkey I. and Khedr, Ayman E. and Alsheref, Fahad Kamal},
  title         = {Adaptive Model for Web Service Recommendation},
  journal       = {International Journal on Web Service Computing (IJWSC)},
  volume        = 4,
  number        = 4,
  pages         = {21--33},
  month         = dec,
  year          = 2013,
  doi           = {10.5121/ijwsc.2013.4403},
  url           = {https://airccse.org/journal/jwsc/papers/4413ijwsc03.pdf},
  issn          = {0976-9811 (Online); 2230-7702 (Print)},
  language      = {english},
  keywords      = {BioCatalogue Data Mining Recommendation System Web and discovery services},
  abstract      = {The Competition between different Web Service Providers to enhance their services and to increase the users' usage of their provided services raises the idea of our research. Our research is focusing on increasing the number of services that User or Developer will use. We proposed a web service’s recommendation model by applying the data mining techniques like Apriori algorithm to suggest another web service beside the one he got from the discovery process based on the user’s History. For implementing our model, we used a curated source for web services and users, which also contains a complete information about users and their web services usage. We found a BioCatalogue: our proposed model was tested on a Curated Web Service Registry (BioCatalogue).and 70 % of users chose services from services that recommended by our model besides the discovered ones by BioCatalogue.},
  added-at      = {2024-11-29T11:00:28.000+0100},
  timestamp     = {2024-11-29T11:00:28.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2916c697a2c6e89badc2bb485a356407a/ijwsc},
  interhash     = {b608572c75c5ae63707441620095e77f},
  intrahash     = {916c697a2c6e89badc2bb485a356407a},
}

% Four different works used to share the key noauthororeditor. BibTeX keeps only
% the first occurrence of a repeated key, so this record retains the key and the
% other three now carry unique author-year keys.
@article{noauthororeditor,
  author        = {Aznag, Mustapha and Quafafou, Mohamed and Durand, Nicolas and Jarir, Zahi},
  title         = {Web Services Discovery and Recommendation Based on Information Extraction and Symbolic Reputation},
  journal       = {International Journal on Web Service Computing (IJWSC)},
  volume        = 4,
  number        = 1,
  pages         = {1--8},
  month         = mar,
  year          = 2013,
  doi           = {10.5121/ijwsc.2013.4101},
  url           = {https://airccse.org/journal/jwsc/papers/4113ijwsc01.pdf},
  issn          = {0976-9811 (Online); 2230-7702 (Print)},
  language      = {english},
  keywords      = {WSDL Web and data discovery extraction file information recommendation representation reputation semantic services symbolic tagging},
  abstract      = {This paper shows that the problem of web services representation is crucial and analyzes the various factors that influence on it. It presents the traditional representation of web services considering traditional textual descriptions based on the information contained in WSDL files. Unfortunately, textual web services descriptions are dirty and need significant cleaning to keep only useful information. To deal with this problem, we introduce rules based text tagging method, which allows filtering web service description to keep only significant information. A new representation based on such filtered data is then introduced. Many web services have empty descriptions. Also, we consider web services representations based on the WSDL file structure (types, attributes, etc.). Alternatively, we introduce a new representation called symbolic reputation, which is computed from relationships between web services. The impact of the use of these representations on web service discovery and recommendation is studied and discussed in the experimentation using real world web services.},
  added-at      = {2024-10-04T09:05:25.000+0200},
  timestamp     = {2024-10-04T09:05:25.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2b5ac5fa2035c27b86a31744253e4e5b1/ijwsc},
  interhash     = {99a2512a93a62a7560e5567491fe03a4},
  intrahash     = {b5ac5fa2035c27b86a31744253e4e5b1},
}

@article{makhlughian2012service,
  author        = {Makhlughian, Molood and Hashemi, Seyyed Mohsen and Rastegari, Yousef and Pejman, Emad},
  title         = {Web Service Selection Based on Ranking of {QoS} Using Associative Classification},
  journal       = {International Journal on Web Service Computing (IJWSC)},
  volume        = 3,
  number        = 1,
  pages         = {1--14},
  month         = mar,
  year          = 2012,
  doi           = {10.5121/ijwsc.2012.3101},
  url           = {https://airccse.org/journal/jwsc/papers/3112ijwsc01.pdf},
  issn          = {0976-9811 (Online); 2230-7702 (Print)},
  language      = {english},
  keywords      = {& (QoS) Classification Data Mining Quality Selection Semantic Service Services Web of},
  abstract      = {With the explosive growth of the number of services published over the Internet, it is difficult to select satisfactory web services among the candidate web services which provide similar functionalities. Quality of Service (QoS) is considered as the most important non-functional criterion for service selection. But this criterion is no longer considered as the only criterion to rank web services, satisfying user’s preferences. The similarity measure (outputs–inputs similarity) between concepts based on ontology in an interconnected network of semantic Web services involved in a composition can be used as a distinguishing criterion to estimate the semantic quality of selected services for the composite service. Coupling the semantic similarity as the functional aspect and quality of services allows us to further constrain and select services for the valid composite services. In this paper, we present an overall service selection and ranking framework which firstly classify candidate web services to different QoS levels respect to user’s QoS requirements and preferences with an Associative Classification algorithm and then rank the most qualified candidate services based on their functional quality through semantic matching. The experimental results show that proposed framework can satisfy service requesters’ non-functional requirements.},
  added-at      = {2024-05-31T10:41:41.000+0200},
  timestamp     = {2024-05-31T10:41:41.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2c50e01f8bb8971eae2d9dcf0353a176c/ijwsc},
  interhash     = {44527917cfc5009633dcf4d3b5e8b851},
  intrahash     = {c50e01f8bb8971eae2d9dcf0353a176c},
}

% NOTE(review): the author was exported as "v Smirnov |, Sergi". Only the stray
% pipe is removed here; whether "v" is a name particle or a middle initial could
% not be determined from this file - confirm against the article page.
@article{smirnov2023reinventing,
  author        = {v Smirnov, Sergi},
  title         = {Reinventing Enterprise Data Archiving for the Digital Era},
  journal       = {Central Asian Journal of Mathematical Theory and Computer Sciences},
  volume        = 4,
  number        = 11,
  pages         = {55--60},
  month         = nov,
  year          = 2023,
  url           = {https://cajmtcs.centralasianstudies.org/index.php/CAJMTCS/article/view/552/594},
  issn          = {2660-5309},
  language      = {english},
  keywords      = {Archiving, Data Digitalization, Enterprise Management Systems Web application,},
  abstract      = {This paper examines next generation approaches to transform enterprise data archiving methodologies for the burgeoning digital age. We analyze the limitations of traditional archiving and the innovations needed to effectively manage massive growth in enterprise data volumes. Detailed sections are included covering the following aspects:• The exponential growth in enterprise data and ensuing archiving challenges• Intelligent policy-based automation to enable smarter archiving.• Cloud-native architectures for highly scalable archives• Persistent metadata synchronization for greater archived data utility • Comparison of on-premises, cloud, and hybrid archiving models• Archiving best practices related to security, retention, discovery etc. • Case studies of real-world archiving implementations• Recommendations for a modern holistic enterprise archiving strategyAdvanced data archiving techniques such as machine learning policies, cloud repositories, active metadata, and unified cross-platform access are imperative today to control data sprawl, accelerate insights, and ensure regulatory compliance. By reimagining archiving for the digital age, enterprises can cost-efficiently extract maximum value from data while minimizing risks.},
  added-at      = {2024-04-25T12:50:52.000+0200},
  timestamp     = {2024-04-25T12:50:52.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2dfcdc96ed9e378a8cdb42a3d8ce09d15/centralasian_20},
  interhash     = {782b5485df79a118ac70c3498482be4a},
  intrahash     = {dfcdc96ed9e378a8cdb42a3d8ce09d15},
}

% The key below is kept unchanged so existing citations still resolve, although
% it does not describe this work. The author list is split according to the
% citation line quoted inside the abstract.
@article{journals/amco/WangZ14,
  author        = {Nalesh, Ku and Sahu, Ghanshyam and Bhaiya, Lalit Kumar P.},
  title         = {Data Processing in Web Mining Structure by Hyperlinks and {PageRank}},
  journal       = {International Journal of Trend in Scientific Research and Development},
  volume        = 7,
  number        = 6,
  pages         = {223--228},
  month         = dec,
  year          = 2023,
  url           = {https://www.ijtsrd.com/computer-science/data-miining/60083/data-processing-in-web-mining-structure-by-hyperlinks-and-pagerank/ku-nalesh},
  issn          = {2456-6470},
  language      = {english},
  keywords      = {Data Graph, Mining, PageRank, Processing Structure Web},
  abstract      = {Creating a quick and effective page ranking system for web crawling and retrieval is still a difficult problem. We suggest constructing a set of PageRank vectors biased using a collection of representative subjects in order to better capture the idea of relevance with regard to a certainty of topic in order to produce more accurate for search results. The outcome of the experiment demonstrates that the suggested algorithm improves the degree of relevance compared to the original one and reduces the topic sensitive PageRanks query time efforts. This paper offers an overview of Web mining as well as a review of its various categories. Next, we concentrate on one of these subcategories Web structure mining. In this area, we describe link mining and examine PageRank, two well liked techniques used in web structure mining. Ku Nalesh | Ghanshyam Sahu | Lalit Kumar P Bhaiya "Data Processing in Web Mining Structure by Hyperlinks and Pagerank" Published in International Journal of Trend in Scientific Research and Development (ijtsrd), ISSN: 2456-6470, Volume-7 | Issue-6 , December 2023, URL: https://www.ijtsrd.com/papers/ijtsrd60083.pdf Paper Url: https://www.ijtsrd.com/computer-science/data-miining/60083/data-processing-in-web-mining-structure-by-hyperlinks-and-pagerank/ku-nalesh},
  added-at      = {2024-01-23T10:11:55.000+0100},
  timestamp     = {2024-01-23T10:11:55.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/208f02c22ab967b2c253a47b4707c9d58/ijtsrd},
  interhash     = {0969e8c1ece51ba8c03ed20983be9177},
  intrahash     = {08f02c22ab967b2c253a47b4707c9d58},
}

@article{papadakis2020blocking,
  author        = {Papadakis, George and Skoutas, Dimitrios and Thanos, Emmanouil and Palpanas, Themis},
  title         = {Blocking and Filtering Techniques for Entity Resolution},
  journal       = {{ACM} Computing Surveys},
  volume        = 53,
  number        = 2,
  pages         = {1--42},
  month         = mar,
  year          = 2020,
  publisher     = {Association for Computing Machinery ({ACM})},
  doi           = {10.1145/3377455},
  url           = {https://doi.org/10.1145/3377455},
  keywords      = {blocking data entity filtering graph knowledge linked ner open resolution semantic web},
  description   = {Blocking and Filtering Techniques for Entity Resolution: A Survey: ACM Computing Surveys: Vol 53, No 2},
  added-at      = {2023-10-10T09:37:01.000+0200},
  timestamp     = {2023-10-10T09:37:01.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2616692e62536356e5c3b9261eb0e73e7/jaeschke},
  interhash     = {666d36dbc4509c24c2aa9f2e867046e9},
  intrahash     = {616692e62536356e5c3b9261eb0e73e7},
}

@misc{hofer2023construction,
  author        = {Hofer, Marvin and Obraczka, Daniel and Saeedi, Alieh and K{\"o}pcke, Hanna and Rahm, Erhard},
  title         = {Construction of Knowledge Graphs: State and Challenges},
  year          = 2023,
  eprint        = {2302.11509},
  archiveprefix = {arXiv},
  note          = {43 pages, 5 figures, 3 tables},
  url           = {http://arxiv.org/abs/2302.11509},
  keywords      = {data graph knowledge linked lod open semantic survey web},
  abstract      = {With knowledge graphs (KGs) at the center of numerous applications such as recommender systems and question answering, the need for generalized pipelines to construct and continuously update such KGs is increasing. While the individual steps that are necessary to create KGs from unstructured (e.g. text) and structured data sources (e.g. databases) are mostly well-researched for their one-shot execution, their adoption for incremental KG updates and the interplay of the individual steps have hardly been investigated in a systematic manner so far. In this work, we first discuss the main graph models for KGs and introduce the major requirement for future KG construction pipelines. Next, we provide an overview of the necessary steps to build high-quality KGs, including cross-cutting topics such as metadata management, ontology development, and quality assurance. We then evaluate the state of the art of KG construction w.r.t the introduced requirements for specific popular KGs as well as some recent tools and strategies for KG construction. Finally, we identify areas in need of further research and improvement.},
  description   = {[2302.11509] Construction of Knowledge Graphs: State and Challenges},
  added-at      = {2023-10-10T09:21:36.000+0200},
  timestamp     = {2023-10-10T09:21:36.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2592c52c278ad9b6fbd608255d016a0ed/jaeschke},
  interhash     = {1c70c8be84477a599ea2bd0e42d84b9d},
  intrahash     = {592c52c278ad9b6fbd608255d016a0ed},
}

@inproceedings{web70,
  author        = {Staab, Steffen and Lehmann, Jens and Verborgh, Ruben},
  title         = {Structured Knowledge on the Web 7.0},
  booktitle     = {Companion Proceedings of the The Web Conference 2018},
  series        = {WWW '18},
  pages         = {885--886},
  year          = 2018,
  publisher     = {International World Wide Web Conferences Steering Committee},
  address       = {Republic and Canton of Geneva, Switzerland},
  location      = {Lyon, France},
  doi           = {10.1145/3184558.3190666},
  url           = {https://doi.org/10.1145/3184558.3190666},
  isbn          = {978-1-4503-5640-4},
  acmid         = {3190666},
  numpages      = {2},
  keywords      = {data rdf schema.org semantic web},
  added-at      = {2023-09-19T10:48:39.000+0200},
  timestamp     = {2023-09-19T10:48:39.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2563bfaa4de490916cb6359a423bf489f/astrupp},
  interhash     = {be73b9cce03c4fd6eb2069494e32ad82},
  intrahash     = {563bfaa4de490916cb6359a423bf489f},
}

@inproceedings{10.1145/3578503.3583601,
  author        = {Alby, Tom},
  title         = {Popular, but Hardly Used: Has {Google Analytics} Been to the Detriment of Web Analytics?},
  booktitle     = {Proceedings of the 15th ACM Web Science Conference 2023},
  series        = {WebSci '23},
  pages         = {304--311},
  year          = 2023,
  publisher     = {Association for Computing Machinery},
  address       = {New York, NY, USA},
  location      = {Austin, TX, USA},
  doi           = {10.1145/3578503.3583601},
  url           = {https://doi.org/10.1145/3578503.3583601},
  isbn          = {9798400700897},
  numpages      = {8},
  keywords      = {analytics, data gdpr, google literacy, measurement, myown web},
  abstract      = {Since 2005, Google has been offering a free version of Google Analytics, allowing website owners to access detailed user behavior data. However, while more and more features and tools have been added to the Google measurement suite since then, it is unclear if the free availability of these tools has really helped users to derive actionable insights for their websites. Earlier studies based on a small number of interviews have suggested that users tend to play with the tools as they lack data literacy, but a broader analysis has been missing by now. Our contribution is a large-scale study of Google Analytics implementations to examine what advanced features are used, allowing conclusions to be drawn about the webmasters’ analysis capabilities. In addition, we detail how difficult it has become to conduct such a study due to the arrangements that website owners have to put in place to comply with the GDPR requirements, but also due to the possibility of obfuscation with the latest development of web analytics software.},
  added-at      = {2023-04-28T09:13:52.000+0200},
  timestamp     = {2023-04-28T09:13:52.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2a39285c981ef3682dd32c7d31307bf47/daswesen},
  interhash     = {8dee427c02deea12aa5ccef294eb4d4d},
  intrahash     = {a39285c981ef3682dd32c7d31307bf47},
}

@article{sitharamulu2016review,
  author        = {Sitharamulu, V. and Babu, B. Raveendra},
  title         = {A Review on {RDB} to {RDF} Mapping for {Semantic Web}},
  journal       = {International Journal on Foundations of Computer Science \& Technology (IJFCST)},
  volume        = 6,
  number        = 2,
  pages         = 8,
  month         = mar,
  year          = 2016,
  doi           = {10.5121/ijfcst.2016.6203},
  url           = {https://wireilla.com/papers/ijfcst/V6N2/6216ijfcst03.pdf},
  issn          = {1839-7662},
  language      = {english},
  keywords      = {(RDB) Characteristic Comparison Data Mapping RDB RDF Semantic Web bases databases relational to},
  abstract      = {In Databases one of the active research fields is mapping relational databases (RDB) into Resource Description Framework (RDF). An enormous data is kept in the form of relational databases and accessing of data is done in the semantic web. The data stored in RDB is to be efficiently mapped to the semantic web or RDF for data availability to the users. There is a definite need for improvement in technologies for efficient mapping languages from RDB to RDF in semantic web. This paper presents an up-to-date survey of different RDB to RDF mapping languages proposed in recent times. It outlines the main features or characteristics to be considered for efficient mapping in different scenarios. The main objective of this content, pictures identification of limitations existing in the mapping languages. It also enhances the comparisons between each language and helps researchers to propose further better proposals in their future scope of work to improve better mapping techniques.},
  added-at      = {2023-03-02T11:38:53.000+0100},
  timestamp     = {2023-03-02T11:38:53.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/200ff5944efdf7f54723c75563d64b1ee/devino},
  interhash     = {2442ed4fe7bded05fa51a8c4e59312f8},
  intrahash     = {00ff5944efdf7f54723c75563d64b1ee},
}

@phdthesis{Waitelonis2018_1000084458,
  author        = {Waitelonis, J{\"o}rg},
  title         = {{Linked Data} Supported Information Retrieval},
  school        = {Karlsruher Institut f{\"u}r Technologie (KIT)},
  publisher     = {Karlsruher Institut f{\"u}r Technologie (KIT)},
  year          = 2018,
  pagetotal     = {256},
  doi           = {10.5445/IR/1000084458},
  language      = {english},
  keywords      = {Data Information Linked Retrieval Semantic Web information_retrieval linked_data semantic_web},
  added-at      = {2022-09-15T14:05:22.000+0200},
  timestamp     = {2022-09-15T14:05:22.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/200dc9da0ba62db2a642611d9e80aab49/vivienvetter},
  interhash     = {6158462249bfc9d1f32cdf288d1109c0},
  intrahash     = {00dc9da0ba62db2a642611d9e80aab49},
}

@inproceedings{Zhang_2021,
  author        = {Zhang, Haoxiang and Santos, A{\'e}cio and Freire, Juliana},
  title         = {{DSDD}: Domain-Specific Dataset Discovery on the Web},
  booktitle     = {Proceedings of the 30th {ACM} International Conference on Information \& Knowledge Management},
  month         = oct,
  year          = 2021,
  publisher     = {{ACM}},
  doi           = {10.1145/3459637.3482427},
  url           = {https://doi.org/10.1145/3459637.3482427},
  keywords      = {crawling data dataset discovery unknowndata web},
  description   = {DSDD | Proceedings of the 30th ACM International Conference on Information & Knowledge Management},
  added-at      = {2022-07-20T17:14:35.000+0200},
  timestamp     = {2022-07-20T17:14:35.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2872aa083df98c63c978b2a3a1b33b67a/jaeschke},
  interhash     = {4b10432c89d6bc8f864d29e8cac15534},
  intrahash     = {872aa083df98c63c978b2a3a1b33b67a},
}

@article{kaur2014web,
  author        = {Kaur, Harmeet and Chawla, Sonal},
  editor        = {Kumar, Shiv},
  title         = {Web Data Mining: Exploring Hidden Patterns, its Types and Web Content Mining Techniques and Tools},
  journal       = {International Journal of Innovative Science and Modern Engineering (IJISME)},
  volume        = 3,
  number        = 1,
  pages         = {34--36},
  month         = dec,
  year          = 2014,
  url           = {https://www.ijisme.org/wp-content/uploads/papers/v3i1/A0769123114.pdf},
  issn          = {2319-6386},
  language      = {english},
  keywords      = {Content Mining Structured Web data data. unstructured},
  abstract      = {The abundance of web data has made it an utmost important source for Web data mining. Web data mining takes WWW data as input and after analysis and discovery, the output i.e. extracted information is used by an organisation. It helps the organisation in taking simpatico decisions for better survival in future. The objective of this paper is four folds. Firstly this paper gives a basic introduction of Web data mining. Secondly, it explains Web data mining categories, thirdly it discusses Web content mining techniques and tools in brief and finally a comparison between various tools available for Web Content Mining.},
  added-at      = {2021-09-21T11:07:58.000+0200},
  timestamp     = {2021-09-21T11:07:58.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/252310e6bd030abe0650c028ef82932cd/ijisme_beiesp},
  interhash     = {ab92b14552ab5536d83e497de509ffc3},
  intrahash     = {52310e6bd030abe0650c028ef82932cd},
}

@inproceedings{paris2021evaluating,
  author        = {Paris, Michael and J{\"a}schke, Robert},
  title         = {Evaluating dataset creation heuristics for concept detection in web pages using {BERT}},
  booktitle     = {Proceedings of the 14th International Conference on Knowledge Science, Engineering and Management},
  series        = {Lecture Notes in Artificial Intelligence},
  volume        = 12816,
  pages         = {1--14},
  year          = 2021,
  publisher     = {Springer},
  doi           = {10.1007/978-3-030-82147-0_14},
  keywords      = {2021 archive bert classification data deeplearning embedding gaw learning machine ml myown network neural regio web},
  abstract      = {Dataset creation for the purpose of training natural language processing (NLP) algorithms is often accompanied by an uncertainty about how the target concept is represented in the data. Extracting such data from web pages and verifying its quality is a non-trivial task, due to the Web's unstructured and heterogeneous nature and the cost of annotation. In that situation, annotation heuristics can be employed to create a dataset that captures the target concept, but in turn may lead to an unstable downstream performance. On the one hand, a trade-off exists between cost, quality, and magnitude for annotation heuristics in tasks such as classification, leading to fluctuations in trained models' performance. On the other hand, general-purpose NLP tools like BERT are now commonly used to benchmark new models on a range of tasks on static datasets. We utilize this standardization as a means to assess dataset quality, as most applications are dataset specific. In this study, we investigate and evaluate the performance of three annotation heuristics for a classification task on extracted web data using BERT. We present multiple datasets, from which the classifier shall learn to identify web pages that are centered around an individual in the academic domain. In addition, we assess the relationship between the performance of the trained classifier and the training data size. The models are further tested on out-of-domain web pages, to asses the influence of the individuals' occupation and web page domain.},
  added-at      = {2021-06-17T22:42:26.000+0200},
  timestamp     = {2021-07-05T14:30:59.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2438b13e07dca3f091d068b28b5de2225/jaeschke},
  interhash     = {723f32bccd871bf86567d727adc126bb},
  intrahash     = {438b13e07dca3f091d068b28b5de2225},
}

@inproceedings{7320414,
  author        = {Maalej, Walid and Nabil, Hadeer},
  title         = {Bug report, feature request, or simply praise? {On} automatically classifying app reviews},
  booktitle     = {2015 IEEE 23rd International Requirements Engineering Conference (RE)},
  pages         = {116--125},
  month         = aug,
  year          = 2015,
  doi           = {10.1109/RE.2015.7320414},
  url           = {https://ieeexplore.ieee.org/abstract/document/7320414},
  issn          = {2332-6441},
  keywords      = {app data review web},
  abstract      = {App stores like Google Play and Apple AppStore have over 3 Million apps covering nearly every kind of software and service. Billions of users regularly download, use, and review these apps. Recent studies have shown that reviews written by the users represent a rich source of information for the app vendors and the developers, as they include information about bugs, ideas for new features, or documentation of released features. This paper introduces several probabilistic techniques to classify app reviews into four types: bug reports, feature requests, user experiences, and ratings. For this we use review metadata such as the star rating and the tense, as well as, text classification, natural language processing, and sentiment analysis techniques. We conducted a series of experiments to compare the accuracy of the techniques and compared them with simple string matching. We found that metadata alone results in a poor classification accuracy. When combined with natural language processing, the classification precision got between 70-95% while the recall between 80-90%. Multiple binary classifiers outperformed single multiclass classifiers. Our results impact the design of review analytics tools which help app vendors, developers, and users to deal with the large amount of reviews, filter critical reviews, and assign them to the appropriate stakeholders.},
  description   = {Bug report, feature request, or simply praise? On automatically classifying app reviews - IEEE Conference Publication},
  added-at      = {2021-01-21T16:34:31.000+0100},
  timestamp     = {2021-01-21T16:34:31.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2070e19c0ebb934423df5f22ddb94b5de/parismic},
  interhash     = {8694b5ae8f35009357b4537a12a9e76c},
  intrahash     = {070e19c0ebb934423df5f22ddb94b5de},
}

@inproceedings{7892635,
  author        = {Gandhi, K. and Madia, N.},
  title         = {Information extraction from unstructured data using {RDF}},
  booktitle     = {2016 International Conference on ICT in Business Industry Government (ICTBIG)},
  pages         = {1--6},
  month         = nov,
  year          = 2016,
  doi           = {10.1109/ICTBIG.2016.7892635},
  url           = {https://ieeexplore.ieee.org/abstract/document/7892635},
  keywords      = {data structured unstructured web},
  abstract      = {The Internet exhibits a gigantic measure of helpful data which is generally designed for its users, which makes it hard to extract applicable information from different sources. Accordingly, the accessibility of strong, adaptable Information Extraction framework that consequently concentrate structured data such as, entities, relationships between entities, and attributes from unstructured or semi-structured sources. But somewhere during extraction of information may lead to the loss of its meaning, which is absolutely not feasible. Semantic Web adds solution to this problem. It is about providing meaning to the data and allow the machine to understand and recognize these augmented data more accurately. The proposed system is about extracting information from research data of IT domain like journals of IEEE, Springer, etc., which aid researchers and the organizations to get the data of journals in an optimized manner so the time and hard work of surfing and reading the entire journal's papers or articles reduces. Also the accuracy of the system is taken care of using RDF, the data extracted has a specific declarative semantics so that the meaning of the research papers or articles during extraction remains unchanged. In addition, the same approach shall be applied on multiple documents, so that time factor can get saved.},
  description   = {Information extraction from unstructured data using RDF - IEEE Conference Publication},
  added-at      = {2020-09-29T11:04:05.000+0200},
  timestamp     = {2020-09-29T11:04:05.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2edb0fa0fa49fd02ab82c0a5dcbf91a0f/parismic},
  interhash     = {05a5bdb8144d946312f187fcc455973d},
  intrahash     = {edb0fa0fa49fd02ab82c0a5dcbf91a0f},
}

@misc{roig2020unsupervised,
  author        = {Roig, Carlos and Varas, David and Masuda, Issey and Riveiro, Juan Carlos and Bou-Balust, Elisenda},
  title         = {Unsupervised Multi-label Dataset Generation from Web Data},
  year          = 2020,
  eprint        = {2005.05623},
  archiveprefix = {arXiv},
  note          = {The 3rd Workshop on Visual Understanding by Learning from Web Data 2019},
  url           = {http://arxiv.org/abs/2005.05623},
  keywords      = {data generation set web},
  abstract      = {This paper presents a system towards the generation of multi-label datasets from web data in an unsupervised manner. To achieve this objective, this work comprises two main contributions, namely: a) the generation of a low-noise unsupervised single-label dataset from web-data, and b) the augmentation of labels in such dataset (from single label to multi label). The generation of a single-label dataset uses an unsupervised noise reduction phase (clustering and selection of clusters using anchors) obtaining a 85% of correctly labeled images. An unsupervised label augmentation process is then performed to assign new labels to the images in the dataset using the class activation maps and the uncertainty associated with each class. This process is applied to the dataset generated in this paper and a public dataset (Places365) achieving a 9.5% and 27% of extra labels in each dataset respectively, therefore demonstrating that the presented system can robustly enrich the initial dataset.},
  description   = {[2005.05623] Unsupervised Multi-label Dataset Generation from Web Data},
  added-at      = {2020-09-28T10:32:25.000+0200},
  timestamp     = {2020-09-28T10:32:25.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/25a423be623ec5649610b1989cc6457f4/parismic},
  interhash     = {6c4aa306722f567af1aa8b29c404d60e},
  intrahash     = {5a423be623ec5649610b1989cc6457f4},
}